RUN-20790 Impersonate dummy dcgm exporter on fake nodes #103

@@ -39,6 +39,8 @@ containers:
value: "{{ .Release.Namespace }}"
- name: TOPOLOGY_MAX_EXPORT_INTERVAL
  value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
- name: FAKE_GPU_OPERATOR_NAMESPACE
  value: "{{ .Release.Namespace }}"
ports:
- containerPort: 9400
  name: http
@@ -49,10 +51,6 @@ restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: status-exporter
serviceAccountName: status-exporter
tolerations:
- effect: NoSchedule
  key: nvidia.com/gpu
  operator: Exists
imagePullSecrets:
- name: gcr-secret
volumes:
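For illustration only (not part of the diff): with the chart's default topologyMaxExportInterval of 10s and a hypothetical install into a namespace named gpu-operator, the touched env entries would render roughly as:

env:
- name: TOPOLOGY_MAX_EXPORT_INTERVAL
  value: "10s"
- name: FAKE_GPU_OPERATOR_NAMESPACE
  value: "gpu-operator"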
@@ -27,3 +27,10 @@ spec:
privileged: true
nodeSelector:
  nvidia.com/gpu.deploy.dcgm-exporter: "true"
tolerations:
- effect: NoSchedule
  key: nvidia.com/gpu
  operator: Exists
{{- if .Values.kwok.tolerations }}
{{ .Values.kwok.tolerations | toYaml | nindent 6 }}
{{- end }}
18 changes: 18 additions & 0 deletions deploy/fake-gpu-operator/templates/status-exporter/service.yaml
@@ -15,3 +15,21 @@ spec:
  selector:
    app: nvidia-dcgm-exporter
  type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: "true"
  labels:
    app: kwok-nvidia-dcgm-exporter
  name: kwok-nvidia-dcgm-exporter
spec:
  ports:
    - name: gpu-metrics
      port: 9400
      protocol: TCP
      targetPort: 9400
  selector:
    app: kwok-nvidia-dcgm-exporter
  type: ClusterIP
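The new Service only selects pods labeled app: kwok-nvidia-dcgm-exporter; as a minimal sketch, the impersonated exporter pods for fake nodes would need that label in their metadata for the selector to match:

metadata:
  labels:
    app: kwok-nvidia-dcgm-exporter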
@@ -0,0 +1,30 @@
{{- if .Values.statusExporter.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: nvidia-dcgm-exporter
  labels:
    release: {{ .Release.Name }}
spec:
  selector:
    matchLabels:
      app: nvidia-dcgm-exporter
  endpoints:
    - port: gpu-metrics
      interval: 30s
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kwok-nvidia-dcgm-exporter
  labels:
    release: {{ .Release.Name }}
spec:
  selector:
    matchLabels:
      app: kwok-nvidia-dcgm-exporter
  endpoints:
    - port: gpu-metrics
      interval: 30s
      honorLabels: true
{{- end }}
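Both ServiceMonitors are gated behind a single values flag, which defaults to false (see values.yaml below). A minimal override to enable them might look like:

statusExporter:
  serviceMonitor:
    enabled: true

Whether Prometheus actually scrapes them also depends on the Prometheus Operator's serviceMonitorSelector matching the release label set above.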
9 changes: 9 additions & 0 deletions deploy/fake-gpu-operator/values.yaml
@@ -53,6 +53,8 @@ statusExporter:
cpu: "200m"
memory: "200Mi"
topologyMaxExportInterval: 10s
serviceMonitor:
  enabled: false

kwokGpuDevicePlugin:
image:
@@ -86,3 +88,10 @@ topology:
gpuMemory: 11441
nodePoolLabelKey: run.ai/simulated-gpu-node-pool
migStrategy: mixed

kwok:
  tolerations:
    - key: kwok.x-k8s.io/node
      operator: Equal
      value: fake
      effect: NoSchedule
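Combining the dcgm-exporter template change above with these defaults, the rendered tolerations block would look roughly like this sketch:

tolerations:
- effect: NoSchedule
  key: nvidia.com/gpu
  operator: Exists
- key: kwok.x-k8s.io/node
  operator: Equal
  value: fake
  effect: NoSchedule

The kwok.x-k8s.io/node toleration matches the taint that KWOK places on fake nodes in this setup, so the impersonated exporter pods can be scheduled onto them.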
83 changes: 83 additions & 0 deletions design/samples/gpu-operator/metrics/active-frac-gpu-pod.yaml
@@ -0,0 +1,83 @@
################ Active Whole GPU Pod ################

###### From DCGM Exporter Directly ######

{
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 300
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 5000
# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C).
# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge
DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C).
# TYPE DCGM_FI_DEV_GPU_TEMP gauge
DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 44
# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W).
# TYPE DCGM_FI_DEV_POWER_USAGE gauge
DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 28.822000
# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ).
# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 484260738
# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries.
# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter
DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %).
# TYPE DCGM_FI_DEV_GPU_UTIL gauge
DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %).
# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge
DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %).
# TYPE DCGM_FI_DEV_ENC_UTIL gauge
DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %).
# TYPE DCGM_FI_DEV_DEC_UTIL gauge
DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered.
# TYPE DCGM_FI_DEV_XID_ERRORS gauge
DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",err_code="0",err_msg="No Error",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB).
# TYPE DCGM_FI_DEV_FB_FREE gauge
DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 14914
# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB).
# TYPE DCGM_FI_DEV_FB_USED gauge
DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 2
# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes.
# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_VGPU_LICENSE_STATUS vGPU License status
# TYPE DCGM_FI_DEV_VGPU_LICENSE_STATUS gauge
DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active.
# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge
DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.999983
# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active.
# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.000000
# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data.
# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge
DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.466046
# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge
DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 5529374
# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge
DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 21260577
}

###### From Prometheus ######

{
DCGM_FI_DEV_FB_FREE{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"}
8422
DCGM_FI_DEV_FB_USED{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"}
6494
DCGM_FI_DEV_GPU_UTIL{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"}
100
}

###### Note ######
ip-172-20-10-26 is the hostname of the node where the GPU is installed.
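A note on the labels in the Prometheus output above: the series carry exported_namespace, exported_pod and exported_container because the scrape does not honor the exporter's own namespace/pod/container labels, so they are renamed and namespace/pod point at the dcgm-exporter pod itself. The kwok ServiceMonitor added in this PR sets honorLabels: true, which corresponds to Prometheus' honor_labels scrape option; a minimal scrape_config sketch (job name is hypothetical):

scrape_configs:
  - job_name: kwok-nvidia-dcgm-exporter   # hypothetical
    honor_labels: true   # keep namespace/pod/container as exposed by the fake exporter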