From c291ffdecd09aeac1ddb639c880f245afa63a179 Mon Sep 17 00:00:00 2001 From: Alexey Fomenko Date: Tue, 20 Aug 2024 19:44:17 +0300 Subject: [PATCH 1/3] Add HPA support to tei, teireranking, tgi services Signed-off-by: Alexey Fomenko --- helm-charts/chatqna/README.md | 40 +++- .../chatqna/templates/customMetrics.yaml | 53 +++++ helm-charts/chatqna/values.yaml | 8 + helm-charts/common/tei/README.md | 74 ++++++- .../common/tei/templates/deployment.yaml | 7 + .../templates/horizontalPodAutoscaler.yaml | 51 +++++ .../common/tei/templates/servicemonitor.yaml | 17 ++ helm-charts/common/tei/values.yaml | 9 + helm-charts/common/teirerank/README.md | 74 ++++++- .../teirerank/templates/deployment.yaml | 7 + .../templates/horizontalPodAutoscaler.yaml | 51 +++++ .../teirerank/templates/servicemonitor.yaml | 17 ++ helm-charts/common/teirerank/values.yaml | 10 + helm-charts/common/tgi/README.md | 62 ++++++ .../common/tgi/templates/deployment.yaml | 7 + .../templates/horizontalPorAutoscaler.yaml | 51 +++++ .../common/tgi/templates/servicemonitor.yaml | 22 ++ helm-charts/common/tgi/values.yaml | 9 + .../config/HPA/customMetrics.yaml | 51 +++++ microservices-connector/config/HPA/tei.yaml | 206 ++++++++++++++++++ .../config/HPA/teirerank.yaml | 205 +++++++++++++++++ microservices-connector/config/HPA/tgi.yaml | 202 +++++++++++++++++ 22 files changed, 1216 insertions(+), 17 deletions(-) create mode 100644 helm-charts/chatqna/templates/customMetrics.yaml create mode 100644 helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml create mode 100644 helm-charts/common/tei/templates/servicemonitor.yaml create mode 100644 helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml create mode 100644 helm-charts/common/teirerank/templates/servicemonitor.yaml create mode 100644 helm-charts/common/tgi/templates/horizontalPorAutoscaler.yaml create mode 100644 helm-charts/common/tgi/templates/servicemonitor.yaml create mode 100644 microservices-connector/config/HPA/customMetrics.yaml create mode 100644 microservices-connector/config/HPA/tei.yaml create mode 100644 microservices-connector/config/HPA/teirerank.yaml create mode 100644 microservices-connector/config/HPA/tgi.yaml diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index e0a2f9e9..de70d662 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -34,6 +34,35 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- 1. Make sure your `MODELDIR` exists on the node where your workload is schedueled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model. +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the TGI and TEI inferencing deployments: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be be installed before enabling HPA, e.g. by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +### Gotchas + +Why HPA is opt-in: + +- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current + `PrometheusAdapter` configuration with its own custom metrics configuration. 
+ Take copy of the existing one before install, if that matters: + `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` +- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: + `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` +- By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) + for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is + asked to install OPEA services to some other namespace, those rules need to be updated accordingly +- Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup + (underlying HW, used models, OPEA version etc) + ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are running. @@ -83,8 +112,9 @@ Access `http://localhost:5174` to play with the ChatQnA workload through UI. ## Values -| Key | Type | Default | Description | -| ---------------- | ------ | ----------------------------- | ------------------------------------------------------------------------ | -| image.repository | string | `"opea/chatqna"` | | -| service.port | string | `"8888"` | | -| tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | +| Key | Type | Default | Description | +| -------------------------------------- | ------ | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| image.repository | string | `"opea/chatqna"` | | +| service.port | string | `"8888"` | | +| tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.horizontalPodAutoscaler.enabled | bop; | false | HPA autoscaling for the TGI and TEI service deployments based on metrics they provide. See #pre-conditions and #gotchas before enabling! 
(If one doesn't want one of them to be scaled, given service `maxReplicas` can be set to `1`) | \ No newline at end of file diff --git a/helm-charts/chatqna/templates/customMetrics.yaml b/helm-charts/chatqna/templates/customMetrics.yaml new file mode 100644 index 00000000..64123df0 --- /dev/null +++ b/helm-charts/chatqna/templates/customMetrics.yaml @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.global.horizontalPodAutoscaler.enabled }} +apiVersion: v1 +data: + config.yaml: | + rules: + - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}' + # Average request latency from TGI histograms, over 1 min + # (0.001 divider add is to make sure there's always a valid value) + metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))' + name: + matches: ^tgi_request_inference_duration_sum + as: "tgi_request_latency" + resources: + # HPA needs both namespace + suitable object resource for its query paths: + # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency + # (pod is not suitable object type for matching as each instance has different name) + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "reranking_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "embedding_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service +kind: ConfigMap +metadata: + name: adapter-config + namespace: monitoring +{{- end }} diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index f848b209..a7a115f9 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -48,3 +48,11 @@ global: modelUseHostPath: "" # modelUseHostPath: /mnt/opea-models # modelUsePVC: model-volume + + # Enabling HorizontalPodAutoscaler (HPA) will: + # - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries + # for embedding, reranking, tgi services + # Upstream default configMap: + # - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml + horizontalPodAutoscaler: + 
enabled: false diff --git a/helm-charts/common/tei/README.md b/helm-charts/common/tei/README.md index 14d647f4..89174634 100644 --- a/helm-charts/common/tei/README.md +++ b/helm-charts/common/tei/README.md @@ -21,6 +21,38 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/BAAI/bge-base-en-v1.5" +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the deployment: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be be installed before enabling HPA, e.g. by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), +so that relevant custom metric queries are configured for PrometheusAdapter. + +### Gotchas + +Why HPA is opt-in: + +- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current + `PrometheusAdapter` configuration with its own custom metrics configuration. + Take copy of the existing one before install, if that matters: + `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` +- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: + `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` +- By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) + for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is + asked to install OPEA services to some other namespace, those rules need to be updated accordingly +- Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup + (underlying HW, used models, OPEA version etc) + ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are runinng. @@ -33,11 +65,41 @@ Open another terminal and run the following command to verify the service if wor curl http://localhost:2081/embed -X POST -d '{"inputs":"What is Deep Learning?"}' -H 'Content-Type: application/json' ``` +### Verify HPA metrics + +To verify that metrics required by horizontalPodAutoscaler option work, check that: + +Prometheus has found the metric endpoints, i.e. last number on the line is non-zero: + +```console +prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s) +curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*tei +``` + +Prometheus adapter provides custom metrics for their data: + +```console +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name +``` + +And those custom metrics have valid values for HPA rules: + +```console +ns=default; # OPEA namespace +url=/apis/custom.metrics.k8s.io/v1beta1; +for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do + kubectl get --raw $url/namespaces/$ns/metrics/$m | jq; +done | grep -e metricName -e value +``` + +NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they've processed their first request! 
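+
+For example, one minimal way to generate that first request (assuming the `tei` service is still port-forwarded to `localhost:2081` as in the Verify step above, and that TEI serves its Prometheus metrics on the same HTTP port under `/metrics`, as the ServiceMonitor in this chart expects):
+
+```console
+# a single embedding request is enough to make the metrics endpoint appear
+curl http://localhost:2081/embed -X POST -d '{"inputs":"What is Deep Learning?"}' -H 'Content-Type: application/json'
+# afterwards the te_* counters used by the HPA rules should be listed here
+curl --no-progress-meter http://localhost:2081/metrics | grep te_request
+```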
+ ## Values -| Key | Type | Default | Description | -| ----------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| EMBEDDING_MODEL_ID | string | `"BAAI/bge-base-en-v1.5"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, tei will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | -| image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | | -| image.tag | string | `"cpu-1.5"` | | +| Key | Type | Default | Description | +| ------------------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| EMBEDDING_MODEL_ID | string | `"BAAI/bge-base-en-v1.5"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, tei will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | +| image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | | +| image.tag | string | `"cpu-1.5"` | | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! | diff --git a/helm-charts/common/tei/templates/deployment.yaml b/helm-charts/common/tei/templates/deployment.yaml index 7467b9ab..fe56355f 100644 --- a/helm-charts/common/tei/templates/deployment.yaml +++ b/helm-charts/common/tei/templates/deployment.yaml @@ -8,7 +8,10 @@ metadata: labels: {{- include "tei.labels" . | nindent 4 }} spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + {{- if not .Values.global.horizontalPodAutoscaler.enabled }} replicas: {{ .Values.replicaCount }} + {{- end }} selector: matchLabels: {{- include "tei.selectorLabels" . | nindent 6 }} @@ -102,3 +105,7 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} + {{- if .Values.global.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 60 + {{- end }} diff --git a/helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml new file mode 100644 index 00000000..a448b96c --- /dev/null +++ b/helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.global.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "tei.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "tei.fullname" . 
}} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # tei-embedding time metrics are in seconds + name: embedding_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: {{ include "tei.fullname" . }} + target: + # embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when + # TEI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/tei/templates/servicemonitor.yaml b/helm-charts/common/tei/templates/servicemonitor.yaml new file mode 100644 index 00000000..05c25528 --- /dev/null +++ b/helm-charts/common/tei/templates/servicemonitor.yaml @@ -0,0 +1,17 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.global.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "tei.fullname" . }} +spec: + selector: + matchLabels: + {{- include "tei.selectorLabels" . | nindent 6 }} + endpoints: + - interval: 4s + port: tei + scheme: http +{{- end }} diff --git a/helm-charts/common/tei/values.yaml b/helm-charts/common/tei/values.yaml index a9edda93..387de250 100644 --- a/helm-charts/common/tei/values.yaml +++ b/helm-charts/common/tei/values.yaml @@ -7,6 +7,9 @@ replicaCount: 1 +horizontalPodAutoscaler: + maxReplicas: 2 + port: 2081 shmSize: 1Gi EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5" @@ -92,3 +95,9 @@ global: # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume. modelUseHostPath: "" modelUsePVC: "" + # Enabling HPA will: + # - Ignore above replica count, as it will be controlled by HPA + # - Add example HPA scaling rules with thresholds suitable for Xeon deployments + # - Require custom metrics ConfigMap available in the main application chart + horizontalPodAutoscaler: + enabled: false diff --git a/helm-charts/common/teirerank/README.md b/helm-charts/common/teirerank/README.md index b3cb2f19..a74079e0 100644 --- a/helm-charts/common/teirerank/README.md +++ b/helm-charts/common/teirerank/README.md @@ -21,6 +21,38 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/BAAI/bge-reranker-base" +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the deployment: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be be installed before enabling HPA, e.g. by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. 
`chatqna`), +so that relevant custom metric queries are configured for PrometheusAdapter. + +### Gotchas + +Why HPA is opt-in: + +- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current + `PrometheusAdapter` configuration with its own custom metrics configuration. + Take copy of the existing one before install, if that matters: + `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` +- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: + `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` +- By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) + for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is + asked to install OPEA services to some other namespace, those rules need to be updated accordingly +- Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup + (underlying HW, used models, OPEA version etc) + ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are runinng. @@ -36,11 +68,41 @@ curl http://localhost:2082/rerank \ -H 'Content-Type: application/json' ``` +### Verify HPA metrics + +To verify that metrics required by horizontalPodAutoscaler option work, check that: + +Prometheus has found the metric endpoints, i.e. last number on the line is non-zero: + +```console +prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s) +curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*rerank +``` + +Prometheus adapter provides custom metrics for their data: + +```console +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name +``` + +And those custom metrics have valid values for HPA rules: + +```console +ns=default; # OPEA namespace +url=/apis/custom.metrics.k8s.io/v1beta1; +for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do + kubectl get --raw $url/namespaces/$ns/metrics/$m | jq; +done | grep -e metricName -e value +``` + +NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they've processed their first request! + ## Values -| Key | Type | Default | Description | -| ----------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| RERANK_MODEL_ID | string | `"BAAI/bge-reranker-base"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, teirerank will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. 
| -| image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | | -| image.tag | string | `"cpu-1.5"` | | +| Key | Type | Default | Description | +| ------------------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| RERANK_MODEL_ID | string | `"BAAI/bge-reranker-base"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, teirerank will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | +| image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | | +| image.tag | string | `"cpu-1.5"` | | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! | diff --git a/helm-charts/common/teirerank/templates/deployment.yaml b/helm-charts/common/teirerank/templates/deployment.yaml index 4a85b7fc..45d2cc95 100644 --- a/helm-charts/common/teirerank/templates/deployment.yaml +++ b/helm-charts/common/teirerank/templates/deployment.yaml @@ -8,7 +8,10 @@ metadata: labels: {{- include "teirerank.labels" . | nindent 4 }} spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + {{- if not .Values.global.horizontalPodAutoscaler.enabled }} replicas: {{ .Values.replicaCount }} + {{- end }} selector: matchLabels: {{- include "teirerank.selectorLabels" . | nindent 6 }} @@ -102,3 +105,7 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} + {{- if .Values.global.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 60 + {{- end }} diff --git a/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml new file mode 100644 index 00000000..bb249305 --- /dev/null +++ b/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.global.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "teirerank.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "teirerank.fullname" . }} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # tei-reranking time metrics are in seconds + name: reranking_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: {{ include "teirerank.fullname" . }} + target: + # reranking_request_latency is average for all TEI pods. 
To avoid replica fluctuations when + # TEI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/teirerank/templates/servicemonitor.yaml b/helm-charts/common/teirerank/templates/servicemonitor.yaml new file mode 100644 index 00000000..52d355a7 --- /dev/null +++ b/helm-charts/common/teirerank/templates/servicemonitor.yaml @@ -0,0 +1,17 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.global.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "teirerank.fullname" . }} +spec: + selector: + matchLabels: + {{- include "teirerank.selectorLabels" . | nindent 6 }} + endpoints: + - interval: 4s + port: teirerank + scheme: http +{{- end }} diff --git a/helm-charts/common/teirerank/values.yaml b/helm-charts/common/teirerank/values.yaml index 80a4cf73..01537c70 100644 --- a/helm-charts/common/teirerank/values.yaml +++ b/helm-charts/common/teirerank/values.yaml @@ -7,6 +7,10 @@ replicaCount: 1 + +horizontalPodAutoscaler: + maxReplicas: 3 + port: 2082 shmSize: 1Gi RERANK_MODEL_ID: "BAAI/bge-reranker-base" @@ -92,3 +96,9 @@ global: # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume. modelUseHostPath: "" modelUsePVC: "" + # Enabling HPA will: + # - Ignore above replica count, as it will be controlled by HPA + # - Add example HPA scaling rules with thresholds suitable for Xeon deployments + # - Require custom metrics ConfigMap available in the main application chart + horizontalPodAutoscaler: + enabled: false diff --git a/helm-charts/common/tgi/README.md b/helm-charts/common/tgi/README.md index 62e4d70c..03f73f74 100644 --- a/helm-charts/common/tgi/README.md +++ b/helm-charts/common/tgi/README.md @@ -24,6 +24,38 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/models--bigscience--bloom-560m" +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the deployment: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be be installed before enabling HPA, e.g. by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), +so that relevant custom metric queries are configured for PrometheusAdapter. + +### Gotchas + +Why HPA is opt-in: + +- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current + `PrometheusAdapter` configuration with its own custom metrics configuration. 
+ Take copy of the existing one before install, if that matters: + `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` +- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: + `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` +- By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) + for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is + asked to install OPEA services to some other namespace, those rules need to be updated accordingly +- Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup + (underlying HW, used models, OPEA version etc) + ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are runinng. @@ -39,6 +71,35 @@ curl http://localhost:2080/generate \ -H 'Content-Type: application/json' ``` +### Verify HPA metrics + +To verify that metrics required by horizontalPodAutoscaler option work, check that: + +Prometheus has found the metric endpoints, i.e. last number on the line is non-zero: + +```console +prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s) +curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*tgi +``` + +Prometheus adapter provides custom metrics for their data: + +```console +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name +``` + +And those custom metrics have valid values for HPA rules: + +```console +ns=default; # OPEA namespace +url=/apis/custom.metrics.k8s.io/v1beta1; +for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do + kubectl get --raw $url/namespaces/$ns/metrics/$m | jq; +done | grep -e metricName -e value +``` + +NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they've processed their first request! + ## Values | Key | Type | Default | Description | @@ -48,3 +109,4 @@ curl http://localhost:2080/generate \ | global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, tgi will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | | image.repository | string | `"ghcr.io/huggingface/text-generation-inference"` | | | image.tag | string | `"1.4"` | | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! | diff --git a/helm-charts/common/tgi/templates/deployment.yaml b/helm-charts/common/tgi/templates/deployment.yaml index 2ef224b5..9587bcae 100644 --- a/helm-charts/common/tgi/templates/deployment.yaml +++ b/helm-charts/common/tgi/templates/deployment.yaml @@ -8,7 +8,10 @@ metadata: labels: {{- include "tgi.labels" . | nindent 4 }} spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + {{- if not .Values.global.horizontalPodAutoscaler.enabled }} replicas: {{ .Values.replicaCount }} + {{- end }} selector: matchLabels: {{- include "tgi.selectorLabels" . | nindent 6 }} @@ -94,3 +97,7 @@ spec: tolerations: {{- toYaml . 
| nindent 8 }} {{- end }} + {{- if .Values.global.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 120 + {{- end }} diff --git a/helm-charts/common/tgi/templates/horizontalPorAutoscaler.yaml b/helm-charts/common/tgi/templates/horizontalPorAutoscaler.yaml new file mode 100644 index 00000000..1131bbdc --- /dev/null +++ b/helm-charts/common/tgi/templates/horizontalPorAutoscaler.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.global.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "tgi.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "tgi.fullname" . }} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # TGI time metrics are in seconds + name: tgi_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: {{ include "tgi.fullname" . }} + target: + # tgi_request_latency is average for all the TGI pods. To avoid replica fluctuations when + # TGI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/tgi/templates/servicemonitor.yaml b/helm-charts/common/tgi/templates/servicemonitor.yaml new file mode 100644 index 00000000..0d7d6ffb --- /dev/null +++ b/helm-charts/common/tgi/templates/servicemonitor.yaml @@ -0,0 +1,22 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Dashboard for the exposed TGI metrics: +# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ +# Metric descriptions: +# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 + +{{- if .Values.global.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "tgi.fullname" . }} +spec: + selector: + matchLabels: + {{- include "tgi.selectorLabels" . | nindent 6 }} + endpoints: + - interval: 4s + port: tgi + scheme: http +{{- end }} diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml index 659f0746..9aa6bae5 100644 --- a/helm-charts/common/tgi/values.yaml +++ b/helm-charts/common/tgi/values.yaml @@ -7,6 +7,9 @@ replicaCount: 1 +horizontalPodAutoscaler: + maxReplicas: 6 + port: 2080 image: @@ -117,3 +120,9 @@ global: # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume. 
modelUseHostPath: "" modelUsePVC: "" + # Enabling HPA will: + # - Ignore above replica count, as it will be controlled by HPA + # - Add example HPA scaling rules with thresholds suitable for Xeon deployments + # - Require custom metrics ConfigMap available in the main application chart + horizontalPodAutoscaler: + enabled: false diff --git a/microservices-connector/config/HPA/customMetrics.yaml b/microservices-connector/config/HPA/customMetrics.yaml new file mode 100644 index 00000000..edf7d295 --- /dev/null +++ b/microservices-connector/config/HPA/customMetrics.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +apiVersion: v1 +data: + config.yaml: | + rules: + - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="release-name-tgi"}' + # Average request latency from TGI histograms, over 1 min + # (0.001 divider add is to make sure there's always a valid value) + metricsQuery: 'rate(tgi_request_inference_duration_sum{service="release-name-tgi",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="release-name-tgi",<<.LabelMatchers>>}[1m]))' + name: + matches: ^tgi_request_inference_duration_sum + as: "tgi_request_latency" + resources: + # HPA needs both namespace + suitable object resource for its query paths: + # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency + # (pod is not suitable object type for matching as each instance has different name) + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="release-name-teirerank"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="release-name-teirerank",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="release-name-teirerank",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "reranking_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="release-name-tei"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="release-name-tei",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="release-name-tei",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "embedding_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service +kind: ConfigMap +metadata: + name: adapter-config + namespace: monitoring + diff --git a/microservices-connector/config/HPA/tei.yaml b/microservices-connector/config/HPA/tei.yaml new file mode 100644 index 00000000..f5fc5725 --- /dev/null +++ b/microservices-connector/config/HPA/tei.yaml @@ -0,0 +1,206 @@ +--- +# Source: tei/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: tei-config + labels: + helm.sh/chart: tei-0.8.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: tei + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "BAAI/bge-base-en-v1.5" + PORT: "2081" + http_proxy: "" + https_proxy: "" + no_proxy: "" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + 
HF_HOME: "/tmp/.cache/huggingface" + MAX_WARMUP_SEQUENCE_LENGTH: "512" +--- +# Source: tei/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: tei + labels: + helm.sh/chart: tei-0.8.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: tei + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2081 + protocol: TCP + name: tei + selector: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: tei +--- +# Source: tei/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tei + labels: + helm.sh/chart: tei-0.8.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: tei + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + selector: + matchLabels: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: tei + template: + metadata: + labels: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: tei + spec: + securityContext: + {} + containers: + - name: tei + envFrom: + - configMapRef: + name: tei-config + - configMapRef: + name: extra-env-config + optional: true + securityContext: + {} + image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" + imagePullPolicy: IfNotPresent + args: + - "--auto-truncate" + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2081 + protocol: TCP + livenessProbe: + failureThreshold: 24 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: model-volume + hostPath: + path: /mnt/opea-models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi + - name: tmp + emptyDir: {} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 60 +--- +# Source: tei/templates/horizontalPodAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: tei +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: tei + minReplicas: 1 + maxReplicas: 2 + metrics: + - type: Object + object: + metric: + # tei-embedding time metrics are in seconds + name: embedding_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: tei + target: + # embedding_request_latency is average for all TEI pods. 
To avoid replica fluctuations when + # TEI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +--- +# Source: tei/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: tei +spec: + selector: + matchLabels: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: tei + endpoints: + - interval: 4s + port: tei + scheme: http diff --git a/microservices-connector/config/HPA/teirerank.yaml b/microservices-connector/config/HPA/teirerank.yaml new file mode 100644 index 00000000..181e8b2c --- /dev/null +++ b/microservices-connector/config/HPA/teirerank.yaml @@ -0,0 +1,205 @@ +--- +# Source: teirerank/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: teirerank-config + labels: + helm.sh/chart: teirerank-0.8.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: teirerank + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "BAAI/bge-reranker-base" + PORT: "2082" + http_proxy: "" + https_proxy: "" + no_proxy: "" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + HF_HOME: "/tmp/.cache/huggingface" +--- +# Source: teirerank/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: teirerank + labels: + helm.sh/chart: teirerank-0.8.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: teirerank + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2082 + protocol: TCP + name: teirerank + selector: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: teirerank +--- +# Source: teirerank/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: teirerank + labels: + helm.sh/chart: teirerank-0.8.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: teirerank + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + selector: + matchLabels: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: teirerank + template: + metadata: + labels: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: teirerank + spec: + securityContext: + {} + containers: + - name: teirerank + envFrom: + - configMapRef: + name: teirerank-config + - configMapRef: + name: extra-env-config + optional: true + securityContext: + {} + image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" + imagePullPolicy: IfNotPresent + args: + - "--auto-truncate" + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - 
mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2082 + protocol: TCP + livenessProbe: + failureThreshold: 24 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: model-volume + hostPath: + path: /mnt/opea-models + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi + - name: tmp + emptyDir: {} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 60 +--- +# Source: teirerank/templates/horizontalPodAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: teirerank +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: teirerank + minReplicas: 1 + maxReplicas: 3 + metrics: + - type: Object + object: + metric: + # tei-reranking time metrics are in seconds + name: reranking_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: teirerank + target: + # reranking_request_latency is average for all TEI pods. To avoid replica fluctuations when + # TEI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +--- +# Source: teirerank/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: teirerank +spec: + selector: + matchLabels: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: teirerank + endpoints: + - interval: 4s + port: teirerank + scheme: http diff --git a/microservices-connector/config/HPA/tgi.yaml b/microservices-connector/config/HPA/tgi.yaml new file mode 100644 index 00000000..aa047b37 --- /dev/null +++ b/microservices-connector/config/HPA/tgi.yaml @@ -0,0 +1,202 @@ +--- +# Source: tgi/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: tgi-config + labels: + helm.sh/chart: tgi-0.8.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: tgi + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "Intel/neural-chat-7b-v3-3" + PORT: "2080" + HF_TOKEN: "insert-your-huggingface-token-here" + http_proxy: "" + https_proxy: "" + no_proxy: "" + HABANA_LOGS: "/tmp/habana_logs" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + HF_HOME: "/tmp/.cache/huggingface" + CUDA_GRAPHS: "0" +--- +# Source: tgi/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + 
name: tgi + labels: + helm.sh/chart: tgi-0.8.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: tgi + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2080 + protocol: TCP + name: tgi + selector: + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: tgi +--- +# Source: tgi/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tgi + labels: + helm.sh/chart: tgi-0.8.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: tgi + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + selector: + matchLabels: + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: tgi + template: + metadata: + labels: + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: tgi + spec: + securityContext: + {} + containers: + - name: tgi + envFrom: + - configMapRef: + name: tgi-config + - configMapRef: + name: extra-env-config + optional: true + securityContext: + {} + image: "ghcr.io/huggingface/text-generation-inference:2.2.0" + imagePullPolicy: IfNotPresent + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2080 + protocol: TCP + livenessProbe: + failureThreshold: 24 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + resources: + {} + volumes: + - name: model-volume + hostPath: + path: /mnt/opea-models + type: Directory + - name: tmp + emptyDir: {} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 120 +--- +# Source: tgi/templates/horizontalPorAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: tgi +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: tgi + minReplicas: 1 + maxReplicas: 6 + metrics: + - type: Object + object: + metric: + # TGI time metrics are in seconds + name: tgi_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: tgi + target: + # tgi_request_latency is average for all the TGI pods. 
To avoid replica fluctuations when + # TGI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +--- +# Source: tgi/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Dashboard for the exposed TGI metrics: +# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ +# Metric descriptions: +# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: tgi +spec: + selector: + matchLabels: + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: tgi + endpoints: + - interval: 4s + port: tgi + scheme: http From 3f971884cf26677756821bb9b5e55332207c040a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 22 Aug 2024 16:17:37 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/chatqna/README.md | 12 ++++++------ helm-charts/common/tei/README.md | 2 +- helm-charts/common/teirerank/README.md | 2 +- helm-charts/common/tgi/README.md | 2 +- .../config/HPA/customMetrics.yaml | 1 - 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index de70d662..28191e6f 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -112,9 +112,9 @@ Access `http://localhost:5174` to play with the ChatQnA workload through UI. ## Values -| Key | Type | Default | Description | -| -------------------------------------- | ------ | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| image.repository | string | `"opea/chatqna"` | | -| service.port | string | `"8888"` | | -| tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.horizontalPodAutoscaler.enabled | bop; | false | HPA autoscaling for the TGI and TEI service deployments based on metrics they provide. See #pre-conditions and #gotchas before enabling! 
(If one doesn't want one of them to be scaled, given service `maxReplicas` can be set to `1`) | \ No newline at end of file +| Key | Type | Default | Description | +| -------------------------------------- | ------ | ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| image.repository | string | `"opea/chatqna"` | | +| service.port | string | `"8888"` | | +| tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.horizontalPodAutoscaler.enabled | bop; | false | HPA autoscaling for the TGI and TEI service deployments based on metrics they provide. See #pre-conditions and #gotchas before enabling! (If one doesn't want one of them to be scaled, given service `maxReplicas` can be set to `1`) | diff --git a/helm-charts/common/tei/README.md b/helm-charts/common/tei/README.md index 89174634..095e33f9 100644 --- a/helm-charts/common/tei/README.md +++ b/helm-charts/common/tei/README.md @@ -34,7 +34,7 @@ If cluster does not run [Prometheus operator](https://github.com/prometheus-oper yet, it SHOULD be be installed before enabling HPA, e.g. by using: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack -`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), +`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), so that relevant custom metric queries are configured for PrometheusAdapter. ### Gotchas diff --git a/helm-charts/common/teirerank/README.md b/helm-charts/common/teirerank/README.md index a74079e0..e1c8d216 100644 --- a/helm-charts/common/teirerank/README.md +++ b/helm-charts/common/teirerank/README.md @@ -34,7 +34,7 @@ If cluster does not run [Prometheus operator](https://github.com/prometheus-oper yet, it SHOULD be be installed before enabling HPA, e.g. by using: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack -`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), +`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), so that relevant custom metric queries are configured for PrometheusAdapter. ### Gotchas diff --git a/helm-charts/common/tgi/README.md b/helm-charts/common/tgi/README.md index 03f73f74..23a9e664 100644 --- a/helm-charts/common/tgi/README.md +++ b/helm-charts/common/tgi/README.md @@ -37,7 +37,7 @@ If cluster does not run [Prometheus operator](https://github.com/prometheus-oper yet, it SHOULD be be installed before enabling HPA, e.g. by using: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack -`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), +`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), so that relevant custom metric queries are configured for PrometheusAdapter. 
### Gotchas diff --git a/microservices-connector/config/HPA/customMetrics.yaml b/microservices-connector/config/HPA/customMetrics.yaml index edf7d295..3709e578 100644 --- a/microservices-connector/config/HPA/customMetrics.yaml +++ b/microservices-connector/config/HPA/customMetrics.yaml @@ -48,4 +48,3 @@ kind: ConfigMap metadata: name: adapter-config namespace: monitoring - From 2b89cd36b682e329a0630e7d9df9c31906ffb3b9 Mon Sep 17 00:00:00 2001 From: Alexey Fomenko Date: Fri, 23 Aug 2024 12:18:21 +0300 Subject: [PATCH 3/3] Consolidate HPA documentation Signed-off-by: Alexey Fomenko --- helm-charts/README.md | 67 ++++++++++++++++++++++++++ helm-charts/chatqna/README.md | 41 +++------------- helm-charts/common/tei/README.md | 63 +----------------------- helm-charts/common/teirerank/README.md | 63 +----------------------- helm-charts/common/tgi/README.md | 63 +----------------------- 5 files changed, 76 insertions(+), 221 deletions(-) diff --git a/helm-charts/README.md b/helm-charts/README.md index 2bec0cf2..c4eef858 100644 --- a/helm-charts/README.md +++ b/helm-charts/README.md @@ -9,6 +9,10 @@ This directory contains helm charts for [GenAIComps](https://github.com/opea-pro - [Components](#components) - [How to deploy with helm charts](#deploy-with-helm-charts) - [Helm Charts Options](#helm-charts-options) +- [HorizontalPodAutoscaler (HPA) support](#horizontalpodautoscaler-hpa-support) + - [Pre-conditions](#pre-conditions) + - [Gotchas](#gotchas) + - [Verify HPA metrics](#verify-hpa-metrics) - [Using Persistent Volume](#using-persistent-volume) - [Using Private Docker Hub](#using-private-docker-hub) - [Helm Charts repository](#helm-chart-repository) @@ -62,8 +66,71 @@ There are global options(which should be shared across all components of a workl | global | http_proxy https_proxy no_proxy | Proxy settings. If you are running the workloads behind the proxy, you'll have to add your proxy settings here. | | global | modelUsePVC | The PersistentVolumeClaim you want to use as huggingface hub cache. Default "" means not using PVC. Only one of modelUsePVC/modelUseHostPath can be set. | | global | modelUseHostPath | If you don't have Persistent Volume in your k8s cluster and want to use local directory as huggingface hub cache, set modelUseHostPath to your local directory name. Note that this can't share across nodes. Default "". Only one of modelUsePVC/modelUseHostPath can be set. | +| global | horizontalPodAutoscaler.enabled | Enable HPA autoscaling for TGI and TEI service deployments based on metrics they provide. See #pre-conditions and #gotchas before enabling! | | tgi | LLM_MODEL_ID | The model id you want to use for tgi server. Default "Intel/neural-chat-7b-v3-3". | +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the TGI and TEI inferencing deployments: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be be installed before enabling HPA, e.g. by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +Enabling HPA in top-level Helm chart (e.g. `chatqna`), overwrites cluster's current _PrometheusAdapter_ +configuration with relevant custom metric queries. 
If that has queries you wish to retain, _or_ HPA is
+otherwise enabled only in TGI or TEI subchart(s), you need to add the relevant queries to _PrometheusAdapter_
+configuration _manually_ (e.g. from `chatqna` custom metrics Helm template).
+
+### Gotchas
+
+Why HPA is opt-in:
+
+- Enabling (top level) chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current
+  `PrometheusAdapter` configuration with its own custom metrics configuration.
+  Take a copy of the existing one before install, if that matters:
+  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
+- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
+  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
+- By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
+  for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is
+  asked to install OPEA services to some other namespace, those rules need to be updated accordingly
+- Current HPA rules are examples for Xeon; for efficient scaling they need to be fine-tuned for the given setup's
+  performance (underlying HW, used models and data types, OPEA version etc.)
+
+### Verify HPA metrics
+
+To verify that the metrics required by the horizontalPodAutoscaler option work, check the following:
+
+Prometheus has found the metric endpoints, i.e. the last number on the `curl` output is non-zero:
+
+```console
+chart=chatqna; # OPEA services prefix
+ns=monitoring; # Prometheus namespace
+prom_url=http://$(kubectl -n $ns get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s);
+curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*$chart
+```
+
+**NOTE**: TGI and TEI inferencing services provide their metrics endpoint only after they've processed their first request!
+
+PrometheusAdapter lists TGI and/or TEI custom metrics (`te_*` / `tgi_*`):
+
+```console
+kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name
+```
+
+HPA rules list valid (not `<unknown>`) TARGET values for the service deployments:
+
+```console
+ns=default; # OPEA namespace
+kubectl -n $ns get hpa
+```
+
 ## Using Persistent Volume
 
 It's common to use Persistent Volume(PV) for model caches(huggingface hub cache) in a production k8s cluster. We support to pass the PersistentVolumeClaim(PVC) to containers, but it's the user's responsibility to create the PVC depending on your k8s cluster's capability.
diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md
index 28191e6f..64f001eb 100644
--- a/helm-charts/chatqna/README.md
+++ b/helm-charts/chatqna/README.md
@@ -34,35 +34,6 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --
 
 1. Make sure your `MODELDIR` exists on the node where your workload is schedueled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model.
 
-## HorizontalPodAutoscaler (HPA) support
-
-`horizontalPodAutoscaler` option enables HPA scaling for the TGI and TEI inferencing deployments:
-https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/
-
-Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).
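To make the "custom metric queries" mentioned above more concrete, here is a minimal sketch of the kind of rule the custom metrics configuration places into the `adapter-config` ConfigMap. The series name (`tgi_queue_size`) and label set are assumptions for illustration; the chart's own custom metrics template is the authoritative source.

```yaml
# Hypothetical PrometheusAdapter rule (config.yaml inside cm/adapter-config):
# expose the Prometheus series tgi_queue_size as a namespaced custom metric
# attached to the tgi Service, so an HPA Object metric can reference it.
rules:
  - seriesQuery: '{__name__="tgi_queue_size",service!=""}'
    resources:
      overrides:
        namespace: { resource: "namespace" }
        service: { resource: "service" }
    name:
      matches: "tgi_queue_size"
    metricsQuery: 'avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)'
```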
-
-### Pre-conditions
-
-If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus)
-yet, it SHOULD be be installed before enabling HPA, e.g. by using:
-https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
-
-### Gotchas
-
-Why HPA is opt-in:
-
-- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current
-  `PrometheusAdapter` configuration with its own custom metrics configuration.
-  Take copy of the existing one before install, if that matters:
-  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
-- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
-  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
-- By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
-  for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is
-  asked to install OPEA services to some other namespace, those rules need to be updated accordingly
-- Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup
-  (underlying HW, used models, OPEA version etc)
-
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -112,9 +83,9 @@ Access `http://localhost:5174` to play with the ChatQnA workload through UI.
 
 ## Values
 
-| Key                                    | Type   | Default                       | Description                                                                                                                                                                                                                                |
-| -------------------------------------- | ------ | ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| image.repository                       | string | `"opea/chatqna"`              |                                                                                                                                                                                                                                            |
-| service.port                           | string | `"8888"`                      |                                                                                                                                                                                                                                            |
-| tgi.LLM_MODEL_ID                       | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory                                                                                                                                                                  |
-| global.horizontalPodAutoscaler.enabled | bop;   | false                         | HPA autoscaling for the TGI and TEI service deployments based on metrics they provide. See #pre-conditions and #gotchas before enabling! (If one doesn't want one of them to be scaled, given service `maxReplicas` can be set to `1`)    |
+| Key                                    | Type   | Default                       | Description                                                                                                                                                                                                                               |
+| -------------------------------------- | ------ | ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| image.repository                       | string | `"opea/chatqna"`              |                                                                                                                                                                                                                                           |
+| service.port                           | string | `"8888"`                      |                                                                                                                                                                                                                                           |
+| tgi.LLM_MODEL_ID                       | string | `"Intel/neural-chat-7b-v3-3"` | Model id from https://huggingface.co/, or predownloaded model directory                                                                                                                                                                  |
+| global.horizontalPodAutoscaler.enabled | bool   | false                         | HPA autoscaling for the TGI and TEI service deployments based on metrics they provide. See HPA section in ../README.md before enabling! 
| diff --git a/helm-charts/common/tei/README.md b/helm-charts/common/tei/README.md index 095e33f9..9d9817ea 100644 --- a/helm-charts/common/tei/README.md +++ b/helm-charts/common/tei/README.md @@ -21,38 +21,6 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/BAAI/bge-base-en-v1.5" -## HorizontalPodAutoscaler (HPA) support - -`horizontalPodAutoscaler` option enables HPA scaling for the deployment: -https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ - -Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). - -### Pre-conditions - -If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) -yet, it SHOULD be be installed before enabling HPA, e.g. by using: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack - -`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), -so that relevant custom metric queries are configured for PrometheusAdapter. - -### Gotchas - -Why HPA is opt-in: - -- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current - `PrometheusAdapter` configuration with its own custom metrics configuration. - Take copy of the existing one before install, if that matters: - `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` -- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: - `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` -- By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) - for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is - asked to install OPEA services to some other namespace, those rules need to be updated accordingly -- Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup - (underlying HW, used models, OPEA version etc) - ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are runinng. @@ -65,35 +33,6 @@ Open another terminal and run the following command to verify the service if wor curl http://localhost:2081/embed -X POST -d '{"inputs":"What is Deep Learning?"}' -H 'Content-Type: application/json' ``` -### Verify HPA metrics - -To verify that metrics required by horizontalPodAutoscaler option work, check that: - -Prometheus has found the metric endpoints, i.e. last number on the line is non-zero: - -```console -prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s) -curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*tei -``` - -Prometheus adapter provides custom metrics for their data: - -```console -kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name -``` - -And those custom metrics have valid values for HPA rules: - -```console -ns=default; # OPEA namespace -url=/apis/custom.metrics.k8s.io/v1beta1; -for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do - kubectl get --raw $url/namespaces/$ns/metrics/$m | jq; -done | grep -e metricName -e value -``` - -NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they've processed their first request! 
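Since TGI and TEI expose their metrics endpoint only after serving a request, it can help to prime both services once before checking the adapter. A minimal sketch using the same port-forward ports as the verification examples in these READMEs (ports and payloads are illustrative and must match your own port-forwards):

```console
# Send one request each to TEI (embeddings, port 2081) and TGI (generation, port 2080)
# so that their /metrics endpoints start reporting data for Prometheus to scrape.
curl http://localhost:2081/embed -X POST -d '{"inputs":"warm-up"}' -H 'Content-Type: application/json'
curl http://localhost:2080/generate -X POST -d '{"inputs":"warm-up","parameters":{"max_new_tokens":5}}' -H 'Content-Type: application/json'
```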
- ## Values | Key | Type | Default | Description | @@ -102,4 +41,4 @@ NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they' | global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, tei will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | | image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | | | image.tag | string | `"cpu-1.5"` | | -| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployment based on metrics it provides. See HPA section in ../../README.md before enabling! | diff --git a/helm-charts/common/teirerank/README.md b/helm-charts/common/teirerank/README.md index e1c8d216..d445364f 100644 --- a/helm-charts/common/teirerank/README.md +++ b/helm-charts/common/teirerank/README.md @@ -21,38 +21,6 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/BAAI/bge-reranker-base" -## HorizontalPodAutoscaler (HPA) support - -`horizontalPodAutoscaler` option enables HPA scaling for the deployment: -https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ - -Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). - -### Pre-conditions - -If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) -yet, it SHOULD be be installed before enabling HPA, e.g. by using: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack - -`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), -so that relevant custom metric queries are configured for PrometheusAdapter. - -### Gotchas - -Why HPA is opt-in: - -- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current - `PrometheusAdapter` configuration with its own custom metrics configuration. - Take copy of the existing one before install, if that matters: - `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` -- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: - `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` -- By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) - for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is - asked to install OPEA services to some other namespace, those rules need to be updated accordingly -- Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup - (underlying HW, used models, OPEA version etc) - ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are runinng. @@ -68,35 +36,6 @@ curl http://localhost:2082/rerank \ -H 'Content-Type: application/json' ``` -### Verify HPA metrics - -To verify that metrics required by horizontalPodAutoscaler option work, check that: - -Prometheus has found the metric endpoints, i.e. 
last number on the line is non-zero: - -```console -prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s) -curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*rerank -``` - -Prometheus adapter provides custom metrics for their data: - -```console -kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name -``` - -And those custom metrics have valid values for HPA rules: - -```console -ns=default; # OPEA namespace -url=/apis/custom.metrics.k8s.io/v1beta1; -for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do - kubectl get --raw $url/namespaces/$ns/metrics/$m | jq; -done | grep -e metricName -e value -``` - -NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they've processed their first request! - ## Values | Key | Type | Default | Description | @@ -105,4 +44,4 @@ NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they' | global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, teirerank will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | | image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | | | image.tag | string | `"cpu-1.5"` | | -| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployment based on metrics it provides. See HPA section in ../../README.md before enabling! | diff --git a/helm-charts/common/tgi/README.md b/helm-charts/common/tgi/README.md index 23a9e664..0100378f 100644 --- a/helm-charts/common/tgi/README.md +++ b/helm-charts/common/tgi/README.md @@ -24,38 +24,6 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/models--bigscience--bloom-560m" -## HorizontalPodAutoscaler (HPA) support - -`horizontalPodAutoscaler` option enables HPA scaling for the deployment: -https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ - -Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). - -### Pre-conditions - -If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) -yet, it SHOULD be be installed before enabling HPA, e.g. by using: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack - -`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), -so that relevant custom metric queries are configured for PrometheusAdapter. - -### Gotchas - -Why HPA is opt-in: - -- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current - `PrometheusAdapter` configuration with its own custom metrics configuration. 
- Take copy of the existing one before install, if that matters: - `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` -- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: - `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` -- By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) - for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is - asked to install OPEA services to some other namespace, those rules need to be updated accordingly -- Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup - (underlying HW, used models, OPEA version etc) - ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are runinng. @@ -71,35 +39,6 @@ curl http://localhost:2080/generate \ -H 'Content-Type: application/json' ``` -### Verify HPA metrics - -To verify that metrics required by horizontalPodAutoscaler option work, check that: - -Prometheus has found the metric endpoints, i.e. last number on the line is non-zero: - -```console -prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s) -curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*tgi -``` - -Prometheus adapter provides custom metrics for their data: - -```console -kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name -``` - -And those custom metrics have valid values for HPA rules: - -```console -ns=default; # OPEA namespace -url=/apis/custom.metrics.k8s.io/v1beta1; -for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do - kubectl get --raw $url/namespaces/$ns/metrics/$m | jq; -done | grep -e metricName -e value -``` - -NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they've processed their first request! - ## Values | Key | Type | Default | Description | @@ -109,4 +48,4 @@ NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they' | global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, tgi will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | | image.repository | string | `"ghcr.io/huggingface/text-generation-inference"` | | | image.tag | string | `"1.4"` | | -| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployment based on metrics it provides. See HPA section in ../../README.md before enabling! |
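As a closing illustration of the options documented in these tables, enabling autoscaling from the top-level chart and restarting PrometheusAdapter afterwards might look like the following sketch. The release and chart names are examples; the restart command is the one given in the gotchas above.

```console
# Turn on HPA support for the whole deployment via the documented values key ...
helm upgrade --install chatqna chatqna --set global.horizontalPodAutoscaler.enabled=true
# ... then restart PrometheusAdapter so it reloads the newly installed custom-metrics rules
ns=monitoring
kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)
```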