From b49bdeace87495a92bf03ef3a69e0fa34479aef4 Mon Sep 17 00:00:00 2001
From: Alexey Fomenko
Date: Tue, 20 Aug 2024 19:44:17 +0300
Subject: [PATCH] Add HPA support to tei, teirerank, tgi services

Signed-off-by: Alexey Fomenko
---
 .../chatqna/templates/customMetrics.yaml      |  53 +++++
 helm-charts/chatqna/values.yaml               |   8 +
 helm-charts/common/tei/README.md              |  74 ++++++-
 .../common/tei/templates/deployment.yaml      |   7 +
 .../templates/horizontalPodAutoscaler.yaml    |  51 +++++
 .../common/tei/templates/servicemonitor.yaml  |  17 ++
 helm-charts/common/tei/values.yaml            |   9 +
 helm-charts/common/teirerank/README.md        |  74 ++++++-
 .../teirerank/templates/deployment.yaml       |   7 +
 .../templates/horizontalPodAutoscaler.yaml    |  51 +++++
 .../teirerank/templates/servicemonitor.yaml   |  17 ++
 helm-charts/common/teirerank/values.yaml      |  10 +
 helm-charts/common/tgi/README.md              |  62 ++++++
 .../common/tgi/templates/deployment.yaml      |   7 +
 .../templates/horizontalPodAutoscaler.yaml    |  51 +++++
 .../common/tgi/templates/servicemonitor.yaml  |  22 ++
 helm-charts/common/tgi/values.yaml            |   9 +
 .../config/HPA/customMetrics.yaml             |  51 +++++
 microservices-connector/config/HPA/tei.yaml   | 205 ++++++++++++++++++
 .../config/HPA/teirerank.yaml                 | 204 +++++++++++++++++
 microservices-connector/config/HPA/tgi.yaml   | 201 +++++++++++++++++
 21 files changed, 1178 insertions(+), 12 deletions(-)
 create mode 100644 helm-charts/chatqna/templates/customMetrics.yaml
 create mode 100644 helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml
 create mode 100644 helm-charts/common/tei/templates/servicemonitor.yaml
 create mode 100644 helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml
 create mode 100644 helm-charts/common/teirerank/templates/servicemonitor.yaml
 create mode 100644 helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml
 create mode 100644 helm-charts/common/tgi/templates/servicemonitor.yaml
 create mode 100644 microservices-connector/config/HPA/customMetrics.yaml
 create mode 100644 microservices-connector/config/HPA/tei.yaml
 create mode 100644 microservices-connector/config/HPA/teirerank.yaml
 create mode 100644 microservices-connector/config/HPA/tgi.yaml

diff --git a/helm-charts/chatqna/templates/customMetrics.yaml b/helm-charts/chatqna/templates/customMetrics.yaml
new file mode 100644
index 00000000..64123df0
--- /dev/null
+++ b/helm-charts/chatqna/templates/customMetrics.yaml
@@ -0,0 +1,53 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: v1
+data:
+  config.yaml: |
+    rules:
+    - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
+      # Average request latency from TGI histograms, over 1 min
+      # (the 0.001 added to the divisor ensures there is always a valid value)
+      metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^tgi_request_inference_duration_sum
+        as: "tgi_request_latency"
+      resources:
+        # HPA needs both namespace + a suitable object resource for its query paths:
+        #   /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
+        # (pod is not a suitable object type for matching, as each instance has a different name)
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
'{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "reranking_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "embedding_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service +kind: ConfigMap +metadata: + name: adapter-config + namespace: monitoring +{{- end }} diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index f848b209..a7a115f9 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -48,3 +48,11 @@ global: modelUseHostPath: "" # modelUseHostPath: /mnt/opea-models # modelUsePVC: model-volume + + # Enabling HorizontalPodAutoscaler (HPA) will: + # - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries + # for embedding, reranking, tgi services + # Upstream default configMap: + # - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml + horizontalPodAutoscaler: + enabled: false diff --git a/helm-charts/common/tei/README.md b/helm-charts/common/tei/README.md index 14d647f4..89174634 100644 --- a/helm-charts/common/tei/README.md +++ b/helm-charts/common/tei/README.md @@ -21,6 +21,38 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/BAAI/bge-base-en-v1.5" +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the deployment: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be be installed before enabling HPA, e.g. by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), +so that relevant custom metric queries are configured for PrometheusAdapter. + +### Gotchas + +Why HPA is opt-in: + +- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current + `PrometheusAdapter` configuration with its own custom metrics configuration. 
+
+### Gotchas
+
+Why HPA is opt-in:
+
+- Enabling the chart `horizontalPodAutoscaler` option will _overwrite_ the cluster's current
+  `PrometheusAdapter` configuration with its own custom metrics configuration.
+  Take a copy of the existing one before install, if that matters:
+  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
+- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
+  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
+- By default, Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
+  for accessing metrics from the `default`, `kube-system` and `monitoring` namespaces. If Helm is
+  asked to install OPEA services into some other namespace, those rules need to be updated accordingly
+- Provided HPA rules are examples for Xeon; for efficient scaling, they need to be fine-tuned for
+  the given setup (underlying HW, used models, OPEA version, etc.)
+
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -33,11 +65,41 @@ Open another terminal and run the following command to verify the service if wor
 curl http://localhost:2081/embed -X POST -d '{"inputs":"What is Deep Learning?"}' -H 'Content-Type: application/json'
 ```
 
+### Verify HPA metrics
+
+To verify that the metrics required by the `horizontalPodAutoscaler` option work, check the following.
+
+Prometheus has found the metric endpoints, i.e. the last number on the line is non-zero:
+
+```console
+prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s)
+curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*tei
+```
+
+PrometheusAdapter provides custom metrics based on their data:
+
+```console
+kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name
+```
+
+And those custom metrics have valid values for the HPA rules:
+
+```console
+ns=default;  # OPEA namespace
+url=/apis/custom.metrics.k8s.io/v1beta1;
+for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do
+  kubectl get --raw $url/namespaces/$ns/metrics/$m | jq;
+done | grep -e metricName -e value
+```
+
+NOTE: HuggingFace TGI and TEI services provide a metrics endpoint only after they have processed their first request!
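+
+After that first request, you can also confirm that the HPA object itself sees the metric, i.e.
+reports a value instead of `<unknown>` (the `tei` object name below assumes the chart's default
+fullname; adjust it for your release):
+
+```console
+kubectl get hpa
+kubectl describe hpa tei
+```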
+
 ## Values
 
-| Key                     | Type   | Default                                           | Description |
-| ----------------------- | ------ | ------------------------------------------------- | ----------- |
-| EMBEDDING_MODEL_ID      | string | `"BAAI/bge-base-en-v1.5"`                         | Models id from https://huggingface.co/, or predownloaded model directory |
-| global.modelUseHostPath | string | `"/mnt/opea-models"`                              | Cached models directory, tei will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
-| image.repository        | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
-| image.tag               | string | `"cpu-1.5"`                                       | |
+| Key                             | Type   | Default                                           | Description |
+| ------------------------------- | ------ | ------------------------------------------------- | ----------- |
+| EMBEDDING_MODEL_ID              | string | `"BAAI/bge-base-en-v1.5"`                         | Model id from https://huggingface.co/, or a pre-downloaded model directory |
+| global.modelUseHostPath         | string | `"/mnt/opea-models"`                              | Cached models directory; tei will not download the model if it is already cached here. The host path "modelUseHostPath" will be mounted into the container as the /data directory. Setting this to null/empty will force it to download the model. |
+| image.repository                | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
+| image.tag                       | string | `"cpu-1.5"`                                       | |
+| horizontalPodAutoscaler.enabled | bool   | false                                             | Enable HPA autoscaling for the service deployment, based on the metrics it provides. See #pre-conditions and #gotchas before enabling! |
diff --git a/helm-charts/common/tei/templates/deployment.yaml b/helm-charts/common/tei/templates/deployment.yaml
index 7467b9ab..fe56355f 100644
--- a/helm-charts/common/tei/templates/deployment.yaml
+++ b/helm-charts/common/tei/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "tei.labels" . | nindent 4 }}
 spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.global.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "tei.selectorLabels" . | nindent 6 }}
@@ -102,3 +105,7 @@ spec:
       tolerations:
         {{- toYaml . | nindent 8 }}
     {{- end }}
+    {{- if .Values.global.horizontalPodAutoscaler.enabled }}
+    # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+    terminationGracePeriodSeconds: 60
+    {{- end }}
diff --git a/helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml
new file mode 100644
index 00000000..a448b96c
--- /dev/null
+++ b/helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "tei.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "tei.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # tei-embedding time metrics are in seconds
+        name: embedding_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: {{ include "tei.fullname" . }}
+      target:
+        # embedding_request_latency is an average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
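+        # An illustrative calculation (hypothetical numbers, not chart defaults): per the
+        # algorithm documentation linked above, desiredReplicas = ceil(currentReplicas *
+        # metricValue / targetValue), so one replica seeing a 9s average latency against
+        # this 4s target scales to ceil(1 * 9 / 4) = 3 replicas (capped by maxReplicas).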
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/tei/templates/servicemonitor.yaml b/helm-charts/common/tei/templates/servicemonitor.yaml
new file mode 100644
index 00000000..05c25528
--- /dev/null
+++ b/helm-charts/common/tei/templates/servicemonitor.yaml
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "tei.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "tei.selectorLabels" . | nindent 6 }}
+  endpoints:
+  - interval: 4s
+    port: tei
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/tei/values.yaml b/helm-charts/common/tei/values.yaml
index a9edda93..387de250 100644
--- a/helm-charts/common/tei/values.yaml
+++ b/helm-charts/common/tei/values.yaml
@@ -7,6 +7,9 @@
 
 replicaCount: 1
 
+horizontalPodAutoscaler:
+  maxReplicas: 2
+
 port: 2081
 shmSize: 1Gi
 EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
@@ -92,3 +95,9 @@ global:
   # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
   modelUseHostPath: ""
   modelUsePVC: ""
+  # Enabling HPA will:
+  # - Ignore the above replica count, as it will be controlled by HPA
+  # - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+  # - Require the custom metrics ConfigMap available in the main application chart
+  horizontalPodAutoscaler:
+    enabled: false
diff --git a/helm-charts/common/teirerank/README.md b/helm-charts/common/teirerank/README.md
index b3cb2f19..a74079e0 100644
--- a/helm-charts/common/teirerank/README.md
+++ b/helm-charts/common/teirerank/README.md
@@ -21,6 +21,38 @@ MODELDIR=/mnt/opea-models
 
 MODELNAME="/data/BAAI/bge-reranker-base"
 
+## HorizontalPodAutoscaler (HPA) support
+
+The `horizontalPodAutoscaler` option enables HPA scaling for the deployment:
+https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/
+
+Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).
+
+### Pre-conditions
+
+If the cluster does not yet run the [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus),
+it SHOULD be installed before enabling HPA, e.g. by using:
+https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
+
+`horizontalPodAutoscaler` needs to be enabled in the top-level Helm chart depending on this
+component (e.g. `chatqna`), so that the relevant custom metric queries are configured for PrometheusAdapter.
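+
+One way to set that stack up (a sketch; the release name `prometheus-stack` and the `monitoring`
+namespace below are illustrative choices, not requirements of this chart):
+
+```console
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+helm install prometheus-stack prometheus-community/kube-prometheus-stack -n monitoring --create-namespace
+```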
+
+### Gotchas
+
+Why HPA is opt-in:
+
+- Enabling the chart `horizontalPodAutoscaler` option will _overwrite_ the cluster's current
+  `PrometheusAdapter` configuration with its own custom metrics configuration.
+  Take a copy of the existing one before install, if that matters:
+  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
+- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
+  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
+- By default, Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
+  for accessing metrics from the `default`, `kube-system` and `monitoring` namespaces. If Helm is
+  asked to install OPEA services into some other namespace, those rules need to be updated accordingly
+- Provided HPA rules are examples for Xeon; for efficient scaling, they need to be fine-tuned for
+  the given setup (underlying HW, used models, OPEA version, etc.)
+
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -36,11 +68,41 @@ curl http://localhost:2082/rerank \
     -H 'Content-Type: application/json'
 ```
 
+### Verify HPA metrics
+
+To verify that the metrics required by the `horizontalPodAutoscaler` option work, check the following.
+
+Prometheus has found the metric endpoints, i.e. the last number on the line is non-zero:
+
+```console
+prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s)
+curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*rerank
+```
+
+PrometheusAdapter provides custom metrics based on their data:
+
+```console
+kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name
+```
+
+And those custom metrics have valid values for the HPA rules:
+
+```console
+ns=default;  # OPEA namespace
+url=/apis/custom.metrics.k8s.io/v1beta1;
+for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do
+  kubectl get --raw $url/namespaces/$ns/metrics/$m | jq;
+done | grep -e metricName -e value
+```
+
+NOTE: HuggingFace TGI and TEI services provide a metrics endpoint only after they have processed their first request!
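+
+After that first request, you can also confirm that the HPA object itself sees the metric, i.e.
+reports a value instead of `<unknown>` (the `teirerank` object name below assumes the chart's
+default fullname; adjust it for your release):
+
+```console
+kubectl get hpa
+kubectl describe hpa teirerank
+```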
+
 ## Values
 
-| Key                     | Type   | Default                                           | Description |
-| ----------------------- | ------ | ------------------------------------------------- | ----------- |
-| RERANK_MODEL_ID         | string | `"BAAI/bge-reranker-base"`                        | Models id from https://huggingface.co/, or predownloaded model directory |
-| global.modelUseHostPath | string | `"/mnt/opea-models"`                              | Cached models directory, teirerank will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
-| image.repository        | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
-| image.tag               | string | `"cpu-1.5"`                                       | |
+| Key                             | Type   | Default                                           | Description |
+| ------------------------------- | ------ | ------------------------------------------------- | ----------- |
+| RERANK_MODEL_ID                 | string | `"BAAI/bge-reranker-base"`                        | Model id from https://huggingface.co/, or a pre-downloaded model directory |
+| global.modelUseHostPath         | string | `"/mnt/opea-models"`                              | Cached models directory; teirerank will not download the model if it is already cached here. The host path "modelUseHostPath" will be mounted into the container as the /data directory. Setting this to null/empty will force it to download the model. |
+| image.repository                | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
+| image.tag                       | string | `"cpu-1.5"`                                       | |
+| horizontalPodAutoscaler.enabled | bool   | false                                             | Enable HPA autoscaling for the service deployment, based on the metrics it provides. See #pre-conditions and #gotchas before enabling! |
diff --git a/helm-charts/common/teirerank/templates/deployment.yaml b/helm-charts/common/teirerank/templates/deployment.yaml
index 4a85b7fc..45d2cc95 100644
--- a/helm-charts/common/teirerank/templates/deployment.yaml
+++ b/helm-charts/common/teirerank/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "teirerank.labels" . | nindent 4 }}
 spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.global.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "teirerank.selectorLabels" . | nindent 6 }}
@@ -102,3 +105,7 @@ spec:
       tolerations:
         {{- toYaml . | nindent 8 }}
     {{- end }}
+    {{- if .Values.global.horizontalPodAutoscaler.enabled }}
+    # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+    terminationGracePeriodSeconds: 60
+    {{- end }}
diff --git a/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml
new file mode 100644
index 00000000..bb249305
--- /dev/null
+++ b/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "teirerank.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "teirerank.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # tei-reranking time metrics are in seconds
+        name: reranking_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: {{ include "teirerank.fullname" . }}
+      target:
+        # reranking_request_latency is an average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/teirerank/templates/servicemonitor.yaml b/helm-charts/common/teirerank/templates/servicemonitor.yaml
new file mode 100644
index 00000000..52d355a7
--- /dev/null
+++ b/helm-charts/common/teirerank/templates/servicemonitor.yaml
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "teirerank.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "teirerank.selectorLabels" . | nindent 6 }}
+  endpoints:
+  - interval: 4s
+    port: teirerank
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/teirerank/values.yaml b/helm-charts/common/teirerank/values.yaml
index 80a4cf73..01537c70 100644
--- a/helm-charts/common/teirerank/values.yaml
+++ b/helm-charts/common/teirerank/values.yaml
@@ -7,6 +7,10 @@
 
 replicaCount: 1
 
+
+horizontalPodAutoscaler:
+  maxReplicas: 3
+
 port: 2082
 shmSize: 1Gi
 RERANK_MODEL_ID: "BAAI/bge-reranker-base"
@@ -92,3 +96,9 @@ global:
   # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
   modelUseHostPath: ""
   modelUsePVC: ""
+  # Enabling HPA will:
+  # - Ignore the above replica count, as it will be controlled by HPA
+  # - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+  # - Require the custom metrics ConfigMap available in the main application chart
+  horizontalPodAutoscaler:
+    enabled: false
diff --git a/helm-charts/common/tgi/README.md b/helm-charts/common/tgi/README.md
index 62e4d70c..03f73f74 100644
--- a/helm-charts/common/tgi/README.md
+++ b/helm-charts/common/tgi/README.md
@@ -24,6 +24,38 @@ MODELDIR=/mnt/opea-models
 
 MODELNAME="/data/models--bigscience--bloom-560m"
 
+## HorizontalPodAutoscaler (HPA) support
+
+The `horizontalPodAutoscaler` option enables HPA scaling for the deployment:
+https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/
+
+Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).
+
+### Pre-conditions
+
+If the cluster does not yet run the [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus),
+it SHOULD be installed before enabling HPA, e.g. by using:
+https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
+
+`horizontalPodAutoscaler` needs to be enabled in the top-level Helm chart depending on this
+component (e.g. `chatqna`), so that the relevant custom metric queries are configured for PrometheusAdapter.
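+
+One way to set that stack up (a sketch; the release name `prometheus-stack` and the `monitoring`
+namespace below are illustrative choices, not requirements of this chart):
+
+```console
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+helm install prometheus-stack prometheus-community/kube-prometheus-stack -n monitoring --create-namespace
+```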
+
+### Gotchas
+
+Why HPA is opt-in:
+
+- Enabling the chart `horizontalPodAutoscaler` option will _overwrite_ the cluster's current
+  `PrometheusAdapter` configuration with its own custom metrics configuration.
+  Take a copy of the existing one before install, if that matters:
+  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
+- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
+  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
+- By default, Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
+  for accessing metrics from the `default`, `kube-system` and `monitoring` namespaces. If Helm is
+  asked to install OPEA services into some other namespace, those rules need to be updated accordingly
+- Provided HPA rules are examples for Xeon; for efficient scaling, they need to be fine-tuned for
+  the given setup (underlying HW, used models, OPEA version, etc.)
+
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -39,6 +71,35 @@ curl http://localhost:2080/generate \
     -H 'Content-Type: application/json'
 ```
 
+### Verify HPA metrics
+
+To verify that the metrics required by the `horizontalPodAutoscaler` option work, check the following.
+
+Prometheus has found the metric endpoints, i.e. the last number on the line is non-zero:
+
+```console
+prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s)
+curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*tgi
+```
+
+PrometheusAdapter provides custom metrics based on their data:
+
+```console
+kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name
+```
+
+And those custom metrics have valid values for the HPA rules:
+
+```console
+ns=default;  # OPEA namespace
+url=/apis/custom.metrics.k8s.io/v1beta1;
+for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do
+  kubectl get --raw $url/namespaces/$ns/metrics/$m | jq;
+done | grep -e metricName -e value
+```
+
+NOTE: HuggingFace TGI and TEI services provide a metrics endpoint only after they have processed their first request!
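+
+After that first request, you can also confirm that the HPA object itself sees the metric, i.e.
+reports a value instead of `<unknown>` (the `tgi` object name below assumes the chart's default
+fullname; adjust it for your release):
+
+```console
+kubectl get hpa
+kubectl describe hpa tgi
+```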
+
 ## Values
 
 | Key                     | Type   | Default                                           | Description |
@@ -48,3 +109,4 @@ curl http://localhost:2080/generate \
 | global.modelUseHostPath | string | `"/mnt/opea-models"`                              | Cached models directory, tgi will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
 | image.repository        | string | `"ghcr.io/huggingface/text-generation-inference"` | |
 | image.tag               | string | `"1.4"`                                           | |
+| horizontalPodAutoscaler.enabled | bool | false                                         | Enable HPA autoscaling for the service deployment, based on the metrics it provides. See #pre-conditions and #gotchas before enabling! |
diff --git a/helm-charts/common/tgi/templates/deployment.yaml b/helm-charts/common/tgi/templates/deployment.yaml
index 2ef224b5..9587bcae 100644
--- a/helm-charts/common/tgi/templates/deployment.yaml
+++ b/helm-charts/common/tgi/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "tgi.labels" . | nindent 4 }}
 spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.global.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "tgi.selectorLabels" . | nindent 6 }}
@@ -94,3 +97,7 @@ spec:
      tolerations:
        {{- toYaml . | nindent 8 }}
    {{- end }}
+    {{- if .Values.global.horizontalPodAutoscaler.enabled }}
+    # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+    terminationGracePeriodSeconds: 120
+    {{- end }}
diff --git a/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml
new file mode 100644
index 00000000..1131bbdc
--- /dev/null
+++ b/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "tgi.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "tgi.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # TGI time metrics are in seconds
+        name: tgi_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: {{ include "tgi.fullname" . }}
+      target:
+        # tgi_request_latency is an average over all the TGI pods. To avoid replica fluctuations when
+        # TGI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/tgi/templates/servicemonitor.yaml b/helm-charts/common/tgi/templates/servicemonitor.yaml
new file mode 100644
index 00000000..0d7d6ffb
--- /dev/null
+++ b/helm-charts/common/tgi/templates/servicemonitor.yaml
@@ -0,0 +1,22 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dashboard for the exposed TGI metrics:
+# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/
+# Metric descriptions:
+# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "tgi.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "tgi.selectorLabels" . | nindent 6 }}
+  endpoints:
+  - interval: 4s
+    port: tgi
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml
index 659f0746..9aa6bae5 100644
--- a/helm-charts/common/tgi/values.yaml
+++ b/helm-charts/common/tgi/values.yaml
@@ -7,6 +7,9 @@
 
 replicaCount: 1
 
+horizontalPodAutoscaler:
+  maxReplicas: 6
+
 port: 2080
 
 image:
@@ -117,3 +120,9 @@ global:
   # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
   modelUseHostPath: ""
   modelUsePVC: ""
+  # Enabling HPA will:
+  # - Ignore the above replica count, as it will be controlled by HPA
+  # - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+  # - Require the custom metrics ConfigMap available in the main application chart
+  horizontalPodAutoscaler:
+    enabled: false
diff --git a/microservices-connector/config/HPA/customMetrics.yaml b/microservices-connector/config/HPA/customMetrics.yaml
new file mode 100644
index 00000000..c3b5de05
--- /dev/null
+++ b/microservices-connector/config/HPA/customMetrics.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+data:
+  config.yaml: |
+    rules:
+    - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
+      # Average request latency from TGI histograms, over 1 min
+      # (the 0.001 added to the divisor ensures there is always a valid value)
+      metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^tgi_request_inference_duration_sum
+        as: "tgi_request_latency"
+      resources:
+        # HPA needs both namespace + a suitable object resource for its query paths:
+        #   /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
+        # (pod is not a suitable object type for matching, as each instance has a different name)
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
+    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
+      # Average request latency from TEI histograms, over 1 min
+      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^te_request_inference_duration_sum
+        as: "reranking_request_latency"
+      resources:
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
+    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
+      # Average request latency from TEI histograms, over 1 min
+      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^te_request_inference_duration_sum
+        as: "embedding_request_latency"
+      resources:
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
+kind: ConfigMap
+metadata:
+  name: adapter-config
+  namespace: monitoring
diff --git a/microservices-connector/config/HPA/tei.yaml b/microservices-connector/config/HPA/tei.yaml
new file mode 100644
index 00000000..54c830e6
--- /dev/null
+++ b/microservices-connector/config/HPA/tei.yaml
@@ -0,0 +1,205 @@
+---
+# Source: tei/templates/configmap.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: tei-config
+  labels:
+    helm.sh/chart: tei-0.8.0
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+data:
+  MODEL_ID: "BAAI/bge-base-en-v1.5"
+  PORT: "2081"
+  http_proxy: ""
+  https_proxy: ""
+  no_proxy: ""
+  NUMBA_CACHE_DIR: "/tmp"
+  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
+  HF_HOME: "/tmp/.cache/huggingface"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+---
+# Source: tei/templates/service.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: tei
+  labels:
+    helm.sh/chart: tei-0.8.0
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 2081
+      protocol: TCP
+      name: tei
+  selector:
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+---
+# Source: tei/templates/deployment.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tei
+  labels:
+    helm.sh/chart: tei-0.8.0
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tei
+      app.kubernetes.io/instance: tei
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tei
+        app.kubernetes.io/instance: tei
+    spec:
+      securityContext:
+        {}
+      containers:
+        - name: tei
+          envFrom:
+            - configMapRef:
+                name: tei-config
+            - configMapRef:
+                name: extra-env-config
+                optional: true
+          securityContext:
+            {}
+          image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
+          imagePullPolicy: IfNotPresent
+          args:
+            - "--auto-truncate"
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /dev/shm
+              name: shm
+            - mountPath: /tmp
+              name: tmp
+          ports:
+            - name: http
+              containerPort: 2081
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          startupProbe:
+            failureThreshold: 120
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          resources:
+            {}
+      volumes:
+        - name: model-volume
+          hostPath:
+            path: /mnt/opea-models
+            type: Directory
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+        - name: tmp
+          emptyDir: {}
+      # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+      terminationGracePeriodSeconds: 60
+---
+# Source: tei/templates/horizontalPodAutoscaler.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: tei
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: tei
+  minReplicas: 1
+  maxReplicas: 2
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # tei-embedding time metrics are in seconds
+        name: embedding_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: tei
+      target:
+        # embedding_request_latency is an average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+---
+# Source: tei/templates/servicemonitor.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: tei
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tei
+  endpoints:
+  - interval: 4s
+    port: tei
+    scheme: http
diff --git a/microservices-connector/config/HPA/teirerank.yaml b/microservices-connector/config/HPA/teirerank.yaml
new file mode 100644
index 00000000..3cd33c14
--- /dev/null
+++ b/microservices-connector/config/HPA/teirerank.yaml
@@ -0,0 +1,204 @@
+---
+# Source: teirerank/templates/configmap.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: teirerank-config
+  labels:
+    helm.sh/chart: teirerank-0.8.0
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+data:
+  MODEL_ID: "BAAI/bge-reranker-base"
+  PORT: "2082"
+  http_proxy: ""
+  https_proxy: ""
+  no_proxy: ""
+  NUMBA_CACHE_DIR: "/tmp"
+  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
+  HF_HOME: "/tmp/.cache/huggingface"
+---
+# Source: teirerank/templates/service.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: teirerank
+  labels:
+    helm.sh/chart: teirerank-0.8.0
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 2082
+      protocol: TCP
+      name: teirerank
+  selector:
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+---
+# Source: teirerank/templates/deployment.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: teirerank
+  labels:
+    helm.sh/chart: teirerank-0.8.0
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: teirerank
+      app.kubernetes.io/instance: teirerank
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: teirerank
+        app.kubernetes.io/instance: teirerank
+    spec:
+      securityContext:
+        {}
+      containers:
+        - name: teirerank
+          envFrom:
+            - configMapRef:
+                name: teirerank-config
+            - configMapRef:
+                name: extra-env-config
+                optional: true
+          securityContext:
+            {}
+          image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
+          imagePullPolicy: IfNotPresent
+          args:
+            - "--auto-truncate"
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /dev/shm
+              name: shm
+            - mountPath: /tmp
+              name: tmp
+          ports:
+            - name: http
+              containerPort: 2082
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          startupProbe:
+            failureThreshold: 120
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          resources:
+            {}
+      volumes:
+        - name: model-volume
+          hostPath:
+            path: /mnt/opea-models
+            type: Directory
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+        - name: tmp
+          emptyDir: {}
+      # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+      terminationGracePeriodSeconds: 60
+---
+# Source: teirerank/templates/horizontalPodAutoscaler.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: teirerank
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: teirerank
+  minReplicas: 1
+  maxReplicas: 3
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # tei-reranking time metrics are in seconds
+        name: reranking_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: teirerank
+      target:
+        # reranking_request_latency is an average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+---
+# Source: teirerank/templates/servicemonitor.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: teirerank
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: teirerank
+  endpoints:
+  - interval: 4s
+    port: teirerank
+    scheme: http
diff --git a/microservices-connector/config/HPA/tgi.yaml b/microservices-connector/config/HPA/tgi.yaml
new file mode 100644
index 00000000..97aedc76
--- /dev/null
+++ b/microservices-connector/config/HPA/tgi.yaml
@@ -0,0 +1,201 @@
+---
+# Source: tgi/templates/configmap.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: tgi-config
+  labels:
+    helm.sh/chart: tgi-0.8.0
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+    app.kubernetes.io/version: "2.1.0"
+    app.kubernetes.io/managed-by: Helm
+data:
+  MODEL_ID: "Intel/neural-chat-7b-v3-3"
+  PORT: "2080"
+  HF_TOKEN: "insert-your-huggingface-token-here"
+  http_proxy: ""
+  https_proxy: ""
+  no_proxy: ""
+  HABANA_LOGS: "/tmp/habana_logs"
+  NUMBA_CACHE_DIR: "/tmp"
+  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
+  HF_HOME: "/tmp/.cache/huggingface"
+  CUDA_GRAPHS: "0"
+---
+# Source: tgi/templates/service.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: tgi
+  labels:
+    helm.sh/chart: tgi-0.8.0
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+    app.kubernetes.io/version: "2.1.0"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 2080
+      protocol: TCP
+      name: tgi
+  selector:
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+---
+# Source: tgi/templates/deployment.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tgi
+  labels:
+    helm.sh/chart: tgi-0.8.0
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+    app.kubernetes.io/version: "2.1.0"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tgi
+      app.kubernetes.io/instance: tgi
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tgi
+        app.kubernetes.io/instance: tgi
+    spec:
+      securityContext:
+        {}
+      containers:
+        - name: tgi
+          envFrom:
+            - configMapRef:
+                name: tgi-config
+            - configMapRef:
+                name: extra-env-config
+                optional: true
+          securityContext:
+            {}
+          image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
+          imagePullPolicy: IfNotPresent
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /tmp
+              name: tmp
+          ports:
+            - name: http
+              containerPort: 2080
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          readinessProbe:
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          startupProbe:
+            failureThreshold: 120
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          resources:
+            {}
+      volumes:
+        - name: model-volume
+          hostPath:
+            path: /mnt/opea-models
+            type: Directory
+        - name: tmp
+          emptyDir: {}
+      # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+      terminationGracePeriodSeconds: 120
+---
+# Source: tgi/templates/horizontalPodAutoscaler.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: tgi
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: tgi
+  minReplicas: 1
+  maxReplicas: 6
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # TGI time metrics are in seconds
+        name: tgi_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: tgi
+      target:
+        # tgi_request_latency is an average over all the TGI pods. To avoid replica fluctuations when
+        # TGI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+---
+# Source: tgi/templates/servicemonitor.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dashboard for the exposed TGI metrics:
+# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/
+# Metric descriptions:
+# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: tgi
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tgi
+  endpoints:
+  - interval: 4s
+    port: tgi
+    scheme: http