-
Notifications
You must be signed in to change notification settings - Fork 52
/
values.yaml
142 lines (120 loc) · 3.58 KB
/
values.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Default values for tgi.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

replicaCount: 1

# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
  maxReplicas: 4
  enabled: false

port: 2080
shmSize: 1Gi

# Set extraCmdArgs if you need to pass additional parameters to TGI for performance
# Refer to https://huggingface.co/docs/text-generation-inference/en/reference/launcher for more options.
# extraCmdArgs: ["--dtype","bfloat16"]

image:
  repository: ghcr.io/huggingface/text-generation-inference
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  # `sha-e4201f4-intel-cpu` is the image tag for intel cpu optimized tgi image
  tag: "sha-e4201f4-intel-cpu"

# empty for CPU
accelDevice: ""

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""

podAnnotations: {}

podSecurityContext: {}
  # fsGroup: 2000

securityContext:
  readOnlyRootFilesystem: true
  allowPrivilegeEscalation: false
  runAsNonRoot: true
  runAsUser: 1000
  capabilities:
    drop:
      - ALL
  seccompProfile:
    type: RuntimeDefault

service:
  type: ClusterIP

resources: {}
  # We usually recommend not to specify default resources and to leave this as a conscious
  # choice for the user. This also increases chances charts run on environments with little
  # resources, such as Minikube. If you do want to specify resources, uncomment the following
  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi

# Use TCP probe instead of HTTP due to bug #483
# https://github.com/opea-project/GenAIExamples/issues/483
livenessProbe:
  tcpSocket:
    port: http
  initialDelaySeconds: 8
  periodSeconds: 8
  timeoutSeconds: 4
  failureThreshold: 24
readinessProbe:
  tcpSocket:
    port: http
  initialDelaySeconds: 16
  periodSeconds: 8
  timeoutSeconds: 4
startupProbe:
  tcpSocket:
    port: http
  initialDelaySeconds: 10
  periodSeconds: 5
  failureThreshold: 180
  timeoutSeconds: 2

# livenessProbe:
#   httpGet:
#     path: /health
#     port: http
#   initialDelaySeconds: 5
#   periodSeconds: 5
#   failureThreshold: 24
# readinessProbe:
#   httpGet:
#     path: /health
#     port: http
#   initialDelaySeconds: 5
#   periodSeconds: 5
# startupProbe:
#   httpGet:
#     path: /health
#     port: http
#   initialDelaySeconds: 5
#   periodSeconds: 5
#   failureThreshold: 120

nodeSelector: {}

tolerations: []

affinity: {}

LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

MAX_INPUT_LENGTH: ""
MAX_TOTAL_TOKENS: ""
CUDA_GRAPHS: "0"

global:
  http_proxy: ""
  https_proxy: ""
  no_proxy: ""
  HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"

  # Choose where to save your downloaded models
  # Set modelUseHostPath for local directory, this is good for one node test. Example:
  # modelUseHostPath: /mnt/opea-models
  # Set modelUsePVC for PersistentVolumeClaim(PVC), which is suitable for multinode deployment. Example:
  # modelUsePVC: model-volume
  # You can only set one of the following vars; the behavior is not defined if both are set.
  # By default, both vars are set to empty, so the model will be downloaded and saved to a tmp volume.
  modelUseHostPath: ""
  modelUsePVC: ""

  # Prometheus Helm installation info for serviceMonitor
  prometheusRelease: prometheus-stack