From 7de612d65bcd3fbf8fc54a69317788921cca467c Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Fri, 17 Mar 2023 17:54:33 +0200 Subject: [PATCH 01/26] KEP: Initial version of the Pass down resources to CRI Co-authored-by: Antti Kervinen --- keps/prod-readiness/sig-node/4112.yaml | 7 + .../4112-passdown-resources-to-cri/README.md | 956 ++++++++++++++++++ .../4112-passdown-resources-to-cri/kep.yaml | 39 + 3 files changed, 1002 insertions(+) create mode 100644 keps/prod-readiness/sig-node/4112.yaml create mode 100644 keps/sig-node/4112-passdown-resources-to-cri/README.md create mode 100644 keps/sig-node/4112-passdown-resources-to-cri/kep.yaml diff --git a/keps/prod-readiness/sig-node/4112.yaml b/keps/prod-readiness/sig-node/4112.yaml new file mode 100644 index 00000000000..a178d8dffe7 --- /dev/null +++ b/keps/prod-readiness/sig-node/4112.yaml @@ -0,0 +1,7 @@ +# The KEP must have an approver from the +# "prod-readiness-approvers" group +# of http://git.k8s.io/enhancements/OWNERS_ALIASES +kep-number: 4112 +alpha: + approver: TBD + diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md new file mode 100644 index 00000000000..d9f9073e60c --- /dev/null +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -0,0 +1,956 @@ + +# [KEP-4112](https://github.com/kubernetes/enhancements/issues/4112): Pass down resources to CRI + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories](#user-stories) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Story 3](#story-3) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [CRI API](#cri-api) + - [ContainerConfig](#containerconfig) + - 
[UpdateContainerResourcesRequest](#updatecontainerresourcesrequest) + - [PodSandboxConfig](#podsandboxconfig) + - [kubelet](#kubelet) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [Container annotations](#container-annotations) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +The kubelet does not provide complete information about the container +resources specification (requests and limits) to CRI. However, various use +cases have been identified where detailed knowledge of all the resources can be +utilized in container runtimes for more optimal resource allocation to improve +application performance and reduce cross-application interference. 
+ +This KEP proposes a CRI API extension for disclosing container resource +requests and limits to container runtimes. + +## Motivation + +Kubelet manages the native resources (CPU and memory) and communicates +resource parameters over the CRI API to the runtime. However, the original +details of the resource spec are lost as they get translated (within kubelet) +to platform-specific (i.e. Linux or Windows) resource controller parameters +like cpu shares, memory limits etc. Non-native resources such as extended +resources and the device plugin resources are not visible to container runtimes +at all. + +However, VM-based runtimes such as +[Kata containers](https://katacontainers.io/), platform-optimized container +runtimes, +[OCI hooks](https://github.com/opencontainers/runtime-spec/blob/master/config.md), +[runC](https://github.com/opencontainers/runc) wrappers, +[NRI](https://github.com/containerd/nri) plugins or in some cases even +applications themselves would benefit on getting full resource information, +e.g. for reserving all required resources at pod sandbox creation (as it might +be hard to impossible after that) or doing customized resource optimization. +Extending the CRI API in to include the resource information would provide a +comprehensive view of all resource usage of containers, allowing improved +resource allocation without breaking any existing use cases. + +### Goals + +- make container resource spec transparently visible to CRI (the container + runtime) + +### Non-Goals + +- change kubelet resource management +- change existing behavior of CRI + +## Proposal + +### User Stories + +#### Story 1 + +As a VM-based container runtime developer, I want to allocate/expose enough +RAM, hugepages, GPU memory, protected memory sections and other resources for +the VM to ensure that all containers in the pod are guaranteed to get the +resources they require. 
+ +#### Story 2 + +As a platform-optimized CRI runtime developer, I want to know detailed +container resource requests to be able to make optimal, platform specific, +resource allocations. Some of the resources may be handled outside cgroups and +be container runtime specific details. + +#### Story 3 + +As a cluster administrator, I want to install an OCI hook/runc wrapper/NRI +plugin that does customized resource handling. + +### Notes/Constraints/Caveats (Optional) + + + +### Risks and Mitigations + + + +The proposal only adds new informational data to the CRI API between kubelet +and the container runtime with no user-visible changes which mitigates possible +risks considerably. + +Data duplication/inconsistency with native resources could be considered a risk +as those are passed down to CRI both as "raw" requests and limits and as +"translated" resource control parameters (like cpu shares, oom scoring etc). But +this should be largely mitigated by code reviews and unit tests. + +## Design Details + +The proposal is that kubelet discloses full resources information from the +PodSpec to the container runtime. This is accomplished by extending the +ContainerConfig, UpdateContainerResourcesRequest and PodSandboxConfig messages +of the CRI API. + +With this information, the runtime can for example do detailed resource +allocation so that CPU, memory and other resources for each container are +optimally aligned. + +The resource information is included in PodSandboxConfig so that the runtime +can see the full picture of Pod's resource usage at Pod creation time, for +example enabling more holistic resource allocation and thus better +interoperability between containers inside the Pod. + +### CRI API + +#### ContainerConfig + +The ContainerConfig message (used in CreateContainer request) is extended to +contain unmodified resource requests from the PodSpec. 
+ +```diff ++import "k8s.io/apimachinery/pkg/api/resource/generated.proto"; + + message ContainerConfig { + + ... + + // Configuration specific to Windows containers. + WindowsContainerConfig windows = 16; + + // CDI devices for the container. + repeated CDIDevice CDI_devices = 17; ++ ++ // Kubernetes resource spec of the container ++ repeated ContainerResourceConfig container_resources = 18; + } + ++message ContainerResourceConfig { ++ // Name of the container ++ string name= 1; ++ // Requests and limits hold corresponding container resources data. ++ map requests = 2; ++ map limits = 3; ++} +``` + +#### UpdateContainerResourcesRequest + +The UpdateContainerResourcesRequest message is extended to pass down unmodified +resource requests from the PodSpec. + +```diff + message UpdateContainerResourcesRequest { + // ID of the container to update. + string container_id = 1; + // Resource configuration specific to Linux containers. + LinuxContainerResources linux = 2; + // Resource configuration specific to Windows containers. + WindowsContainerResources windows = 3; + // Unstructured key-value map holding arbitrary additional information for + // container resources updating. This can be used for specifying experimental + // resources to update or other options to use when updating the container. + map annotations = 4; ++ ++ // Kubernetes resource spec of the container ++ ContainerResourceConfig container_resources = 5; + } +``` + +#### PodSandboxConfig + +The PodSandboxConfig message (part of the RunPodSandbox request) will be +extended to contain information about resources of all its containers known at +the pod creation time. The container resources here are non-binding and only +informational, e.g. for the runtime to prepare for optimal allocation of +resources of all the containers of the Pod. + +```diff + message PodSandboxConfig { + + ... + + // Optional configurations specific to Linux hosts. 
+ LinuxPodSandboxConfig linux = 8; + // Optional configurations specific to Windows hosts. + WindowsPodSandboxConfig windows = 9; ++ ++ // Kubernetes resource spec of the containers in the pod. ++ PodResourceConfig pod_resources = 10; + } + ++message PodResourceConfig { ++ repeated ContainerResourceConfig init_containers = 1; ++ repeated ContainerResourceConfig containers = 2; ++} +``` + +### kubelet + +Kubelet will be be extended to pass down the unmodified resource requests and +limits to the container runtime in all related CRI requests, i.e. +RunPodSandbox, CreateContainer and UpdateContainerResources. + +For example, take a PodSpec: + +```yaml +apiVersion: v1 +kind: Pod +... +spec: + containers: + - name: cnt-1 + ... + resources: + requests: + cpu: 1 + memory: 1G + limits: + cpu: 2 + memory: 2G + - name: cnt-2 + ... + resources: + requests: + cpu: 100m + memory: 100M + vendor.com/xpu: 1 + limits: + cpu: 200m + memory: 200M + vendor.com/xpu: 1 +``` + +Then kubelet will send the following RunPodSandboxRequest when creating the Pod +(represented here in yaml format): + +```yaml +RunPodSandboxRequest: + config: + ... + podResources: + containers: + - name: cnt-1 + requests: + cpu: 1 + memory: 1G + limits: + cpu: 2 + memory: 2G + - name: cnt-2 + requests: + cpu: 100m + memory: 100M + vendor.com/xpu: 1 + limits: + cpu: 200m + memory: 200M + vendor.com/xpu: 1 +``` + +### Test Plan + + + +[ ] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes necessary +to implement this enhancement. 
+ +##### Prerequisite testing updates + + + +##### Unit tests + + + + + +- ``: `` - `` + +##### Integration tests + + + +- : + +##### e2e tests + + + +- : + +### Graduation Criteria + + + +### Upgrade / Downgrade Strategy + + + +### Version Skew Strategy + + + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [ ] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: + - Components depending on the feature gate: +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? + +###### Does enabling the feature change any default behavior? + + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + + +###### What happens if we reenable the feature if it was previously rolled back? + +###### Are there any tests for feature enablement/disablement? + + + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? 
+ + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + +## Drawbacks + + + +## Alternatives + + + +### Container annotations + +Container annotations could be used as an alternative way to pass down the +resource requests and limits to the container runtime. 
+ +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml new file mode 100644 index 00000000000..b75388fa6d7 --- /dev/null +++ b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml @@ -0,0 +1,39 @@ +title: Pass down resources to CRI +kep-number: 4112 +authors: + - "@marquiz" + - "@askervin" +owning-sig: sig-node +participating-sigs: [] +status: provisional +creation-date: 2023-06-28 +reviewers: + - TBD +approvers: + - TBD + +see-also: [] +replaces: [] + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.29" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.29" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: TBD + components: + - kubelet +disable-supported: true + +# The following PRR answers are required at beta release +metrics: [] From bc8e2999007673cab3c675f4f5a19f5982b9d9e8 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 18 Jul 2023 11:29:03 +0300 Subject: [PATCH 02/26] KEP-4112: address review feedback from haircommander --- keps/sig-node/4112-passdown-resources-to-cri/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index d9f9073e60c..3699423843e 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -284,6 +284,8 @@ contain unmodified resource requests from the PodSpec. 
+message ContainerResourceConfig { + // Name of the container ++ // Note: name is redundant for container-specific requests (UpdateContainerResourcesRequest) ++ // but needed for pod-specific requests (PodSandboxConfig). + string name= 1; + // Requests and limits hold corresponding container resources data. + map requests = 2; From 5d220511e9cca9642ec1fdef086a28b17e12b158 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Thu, 21 Sep 2023 18:01:17 +0300 Subject: [PATCH 03/26] KEP-4112: add CDI devices and mounts --- .../4112-passdown-resources-to-cri/README.md | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 3699423843e..d11da8f2f24 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -279,17 +279,13 @@ contain unmodified resource requests from the PodSpec. repeated CDIDevice CDI_devices = 17; + + // Kubernetes resource spec of the container -+ repeated ContainerResourceConfig container_resources = 18; ++ KubernetesResources kubernetes_resources = 18; } -+message ContainerResourceConfig { -+ // Name of the container -+ // Note: name is redundant for container-specific requests (UpdateContainerResourcesRequest) -+ // but needed for pod-specific requests (PodSandboxConfig). -+ string name= 1; -+ // Requests and limits hold corresponding container resources data. -+ map requests = 2; -+ map limits = 3; ++message KubernetesResources { ++ // Requests and limits from the Kubernetes container config. ++ map requests = 1; ++ map limits = 2; +} ``` @@ -312,7 +308,7 @@ resource requests from the PodSpec. 
map annotations = 4; + + // Kubernetes resource spec of the container -+ ContainerResourceConfig container_resources = 5; ++ KubernetesResources kubernetes_resources = 18; } ``` @@ -342,6 +338,20 @@ resources of all the containers of the Pod. + repeated ContainerResourceConfig init_containers = 1; + repeated ContainerResourceConfig containers = 2; +} + ++message ContainerResourceConfig { ++ // Name of the container ++ string name= 1; ++ ++ // Kubernetes resource spec of the container ++ KubernetesResources kubernetes_resources = 2; ++ ++ // CDI devices for the container. ++ repeated CDIDevice CDI_devices = 3; ++ ++ // Mounts for the container. ++ repeated Mount mounts = 4; ++} ``` ### kubelet From eabe5f1451e84f04b0b3231720cf6e046ef49022 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Thu, 21 Sep 2023 20:19:18 +0300 Subject: [PATCH 04/26] KEP-4112: small fix to protobuf spec --- keps/sig-node/4112-passdown-resources-to-cri/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index d11da8f2f24..c66ff9fcac8 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -308,7 +308,7 @@ resource requests from the PodSpec. 
map annotations = 4; + + // Kubernetes resource spec of the container -+ KubernetesResources kubernetes_resources = 18; ++ KubernetesResources kubernetes_resources = 5; } ``` From 8de0aad121803e6b12942c709e0a0e9f9b212a75 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Fri, 22 Sep 2023 11:09:57 +0300 Subject: [PATCH 05/26] KEP-4112: added devices to the pod-level resource info --- .../4112-passdown-resources-to-cri/README.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index c66ff9fcac8..add8d606f33 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -282,6 +282,8 @@ contain unmodified resource requests from the PodSpec. + KubernetesResources kubernetes_resources = 18; } ++// KubernetesResources contains the resource requests and limits as specified ++// in the Kubernetes core API ResourceRequirements. +message KubernetesResources { + // Requests and limits from the Kubernetes container config. + map requests = 1; @@ -334,11 +336,15 @@ resources of all the containers of the Pod. + PodResourceConfig pod_resources = 10; } ++// PodResourceConfig contains information of all resources requirements of ++// the containers of a pod. +message PodResourceConfig { + repeated ContainerResourceConfig init_containers = 1; + repeated ContainerResourceConfig containers = 2; +} ++// ContainerResourceConfig contains information of all resource requirements of ++// one container. +message ContainerResourceConfig { + // Name of the container + string name= 1; @@ -346,11 +352,14 @@ resources of all the containers of the Pod. + // Kubernetes resource spec of the container + KubernetesResources kubernetes_resources = 2; + -+ // CDI devices for the container. -+ repeated CDIDevice CDI_devices = 3; -+ + // Mounts for the container. 
-+ repeated Mount mounts = 4; ++ repeated Mount mounts = 3; ++ ++ // Devices for the container. ++ repeated Device devices = 4; ++ ++ // CDI devices for the container. ++ repeated CDIDevice CDI_devices = 5; +} ``` From 868bc5c1056d705faa6973b086c2715f8222cc04 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 26 Sep 2023 10:07:22 +0300 Subject: [PATCH 06/26] KEP-4112: edited summary, motivation and goals --- .../4112-passdown-resources-to-cri/README.md | 72 ++++++++++++------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index add8d606f33..2c251e7d281 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -145,40 +145,60 @@ Items marked with (R) are required *prior to targeting to a milestone / release* ## Summary -The kubelet does not provide complete information about the container -resources specification (requests and limits) to CRI. However, various use -cases have been identified where detailed knowledge of all the resources can be -utilized in container runtimes for more optimal resource allocation to improve -application performance and reduce cross-application interference. +The CRI runtime lacks visibility to the application resource requirements. -This KEP proposes a CRI API extension for disclosing container resource -requests and limits to container runtimes. +First, the resources required by the containers of a pod are not visible at the +pod sandbox creation time. This can be problematic for example in the case of +VM-based runtimes where all resources need to be reserved/prepared when the VM +(i.e. sandbox) is being created. + +Second, the kubelet does not provide complete information about the container +resources specification of native and extended resources (requests and limits) +to CRI. 
However, various use cases have been identified where detailed +knowledge of all the resources can be utilized in container runtimes for more +optimal resource allocation to improve application performance and reduce +cross-application interference. + +This KEP proposes CRI API extensions for providing a complete view of a pod's +resources at sandbox creation and for providing unobfuscated information about +the resource requests and limits to container runtimes. ## Motivation -Kubelet manages the native resources (CPU and memory) and communicates -resource parameters over the CRI API to the runtime. However, the original -details of the resource spec are lost as they get translated (within kubelet) -to platform-specific (i.e. Linux or Windows) resource controller parameters -like cpu shares, memory limits etc. Non-native resources such as extended -resources and the device plugin resources are not visible to container runtimes -at all. - -However, VM-based runtimes such as -[Kata containers](https://katacontainers.io/), platform-optimized container -runtimes, +When the pod sandbox is created, the kubelet does not provide the CRI runtime +any information about the resources (such as native resources, host devices, +mounts, CDI devices etc) that will be required by the application. The CRI +runtime only becomes aware of the resources piece by piece when containers of +the pod are created (one-by-one). This can cause issues with VM-based runtimes +(e.g. [Kata containers](https://katacontainers.io/)) that need to prepare the +VM, with all needed resources, at sandbox creation and cannot do any +modifications (i.e. attach/reserve new resources) after that (i.e. when +individual containers are created). + +Another visibility issue is related to the native and extended resources. +Kubelet manages the native resources (CPU and memory) and communicates resource +parameters over the CRI API to the runtime. 
However, the original details of +the resource spec are lost as they get translated (within kubelet) to +platform-specific (i.e. Linux or Windows) resource controller parameters like +cpu shares, memory limits etc. Non-native resources such as extended resources +and the device plugin resources are completely invisible to the CRI runtime. However, [OCI hooks](https://github.com/opencontainers/runtime-spec/blob/master/config.md), [runC](https://github.com/opencontainers/runc) wrappers, [NRI](https://github.com/containerd/nri) plugins or in some cases even -applications themselves would benefit on getting full resource information, -e.g. for reserving all required resources at pod sandbox creation (as it might -be hard to impossible after that) or doing customized resource optimization. -Extending the CRI API in to include the resource information would provide a -comprehensive view of all resource usage of containers, allowing improved -resource allocation without breaking any existing use cases. +applications themselves would benefit from seeing the original resource requests +and limits e.g. for doing customized resource optimization. + +Extending the CRI API to communicate all resources already at sandbox creation +and pass down resource requests and limits (of native and extended resources) +would provide a comprehensive and early-enough view of the resource usage of +all containers of the pod, allowing improved resource allocation without +breaking any existing use cases. ### Goals +- make the information about all required resources (e.g. native and extended + resources, devices, mounts, CDI devices) of a Pod available to the CRI at + sandbox creation time - make container resource spec transparently visible to CRI (the container runtime) @@ -365,6 +385,10 @@ resources of all the containers of the Pod. ### kubelet +Kubelet code is refactored/modified so that all container resources are known +before sandbox creation. 
This mainly consists of preparing all mounts (of all +containers) early. + Kubelet will be be extended to pass down the unmodified resource requests and limits to the container runtime in all related CRI requests, i.e. RunPodSandbox, CreateContainer and UpdateContainerResources. From 57279d611cec3034d7e2c38c5fbbbd9544967d0c Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Wed, 4 Oct 2023 12:06:15 +0000 Subject: [PATCH 07/26] KEP-4112: detailed description for Kata and Confidential Container --- .../4112-passdown-resources-to-cri/README.md | 61 ++++++++++++++++--- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 2c251e7d281..b990d55f381 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -169,15 +169,58 @@ When the pod sandbox is created, the kubelet does not provide the CRI runtime any information about the resources (such as native resources, host devices, mounts, CDI devices etc) that will be required by the application. The CRI runtime only becomes aware of the resources piece by piece when containers of -the pod are created (one-by-one). This can cause issues with VM-based runtimes -(e.g. [Kata containers](https://katacontainers.io/)) that need to prepare the -VM, with all needed resources, at sandbox creation and cannot do any -modifications (i.e. attach/reserve new resources) after that (i.e. when -individual containers are created). +the pod are created (one-by-one). + +This can cause issues with VM-based runtimes +(e.g. [Kata containers](https://katacontainers.io/) and [Confidential Containers](https://www.cncf.io/projects/confidential-containers/)) that need to prepare the VM before containers are created. 
+ +For Kata to handle PCIe devices properly the CRI needs to tell the kata-runtime +how many PCIe root-ports or PCIe switch-ports the hypervisor needs to create at +sandbox creation depending on the number of devices allocated by the containers. +The PCIe root-port is a static configuration and the hypervisor cannot adjust it +once the sandbox is created. During container creation the PCIe devices are +hot-plugged to the PCIe root-port or switch-port. If the number of pre-allocated +pluggable ports is too low, the attachment will fail (container devices > +pre-allocated hot-pluggable ports). + +In the case of Confidential Containers (uses Kata under the hood with additional +software components for attestation) the CRI needs to consider the cold-plug aka +direct attachment use-case. At sandbox creation time the hypervisor needs to +know the exact number of pass-through devices and their properties +(VFIO IOMMU group, the actual VFIO device - there can be several devices in an +IOMMU group, attach to PCIe root-port or PCIe switch-port (PCI-Bridge)). +In a confidential setting a user does not want to reconfigure the VM +(creates an attack-vector) on every create container request. The hypervisor +needs a fully static view of resources needed for VM sizing. + +Independent of hot or cold-plug the hypervisor needs to know how the PCI(e) +topology needs to look like at sandbox creation time. + +Updating resources of a container means also resizing the VM, hence the +hypervisors needs the complete list of resources available at a update container +request. Another visibility issue is related to the native and extended resources. Kubelet manages the native resources (CPU and memory) and communicates resource -parameters over the CRI API to the runtime. However, the original details of +parameters over the CRI API to the runtime. The following snippet shows the +currently supported CRI annotations that are provided by the Kubelet to e.g. 
+`containerd`: + +```sh +pkg/cri/annotations/annotations.go + + // SandboxCPU annotations are based on the initial CPU configuration for the sandbox. This is calculated as the + // sum of container CPU resources, optionally provided by Kubelet (introduced in 1.23) as part of the PodSandboxConfig + SandboxCPUPeriod = "io.kubernetes.cri.sandbox-cpu-period" + SandboxCPUQuota = "io.kubernetes.cri.sandbox-cpu-quota" + SandboxCPUShares = "io.kubernetes.cri.sandbox-cpu-shares" + + // SandboxMemory is the initial amount of memory associated with this sandbox. This is calculated as the sum + // of container memory, optionally provided by Kubelet (introduced in 1.23) as part of the PodSandboxConfig. + SandboxMem = "io.kubernetes.cri.sandbox-memory" +``` + +However, the original details of the resource spec are lost as they get translated (within kubelet) to platform-specific (i.e. Linux or Windows) resource controller parameters like cpu shares, memory limits etc. Non-native resources such as extended resources @@ -214,9 +257,9 @@ breaking any existing use cases. #### Story 1 As a VM-based container runtime developer, I want to allocate/expose enough -RAM, hugepages, GPU memory, protected memory sections and other resources for -the VM to ensure that all containers in the pod are guaranteed to get the -resources they require. +RAM, hugepages, hot- or cold-pluggable PCI(e) ports, protected memory sections +and other resources for the VM to ensure that all containers in the pod are +guaranteed to get the resources they require. 
#### Story 2 From f5e890f96850f589c93e0a56866057f70fd3c162 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Wed, 15 Nov 2023 10:58:57 +0200 Subject: [PATCH 08/26] KEP-4112: update milestones in kep.yaml --- keps/sig-node/4112-passdown-resources-to-cri/kep.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml index b75388fa6d7..628676bc15f 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml +++ b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml @@ -21,11 +21,11 @@ stage: alpha # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.29" +latest-milestone: "v1.30" # The milestone at which this feature was, or is targeted to be, at each stage. milestone: - alpha: "v1.29" + alpha: "v1.30" # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled From 870d027b872dfa289421353e6a7005d59c210bf0 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Fri, 26 Jan 2024 11:27:39 +0200 Subject: [PATCH 09/26] KEP-4112: update - updated user story 3 about customized resource management - updated example pod spec vs RunPodSandboxRequest proto - added a note about device plugin resources --- .../4112-passdown-resources-to-cri/README.md | 88 +++++++++++-------- 1 file changed, 49 insertions(+), 39 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index b990d55f381..290b20fe06d 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -194,7 +194,7 @@ In a confidential setting a user does not want to reconfigure the VM needs a fully static view of resources needed 
for VM sizing. Independent of hot or cold-plug the hypervisor needs to know how the PCI(e) -topology needs to look like at sandbox creation time. +topology needs to look like at sandbox creation time. Updating resources of a container means also resizing the VM, hence the hypervisors needs the complete list of resources available at a update container @@ -270,8 +270,11 @@ be container runtime specific details. #### Story 3 -As a cluster administrator, I want to install an OCI hook/runc wrapper/NRI -plugin that does customized resource handling. +As a cluster administrator, I want to install an NRI plugin that does +customized resource handling. I run kubelet with CPU manager and memory manager +disabled. Instead I use my NRI plugin to do customized resource allocation +(e.g. cpu and memory pinning). To do that properly I need the actual resource +requests and limits requested by the user. ### Notes/Constraints/Caveats (Optional) @@ -432,9 +435,10 @@ Kubelet code is refactored/modified so that all container resources are known before sandbox creation. This mainly consists of preparing all mounts (of all containers) early. -Kubelet will be be extended to pass down the unmodified resource requests and -limits to the container runtime in all related CRI requests, i.e. -RunPodSandbox, CreateContainer and UpdateContainerResources. +Kubelet will be extended to pass down all mounts, devices, CDI devices and +the unmodified resource requests and limits to the container runtime in all +related CRI requests, i.e. RunPodSandbox, CreateContainer and +UpdateContainerResources. For example, take a PodSpec: @@ -444,26 +448,23 @@ kind: Pod ... spec: containers: - - name: cnt-1 - ... - resources: - requests: - cpu: 1 - memory: 1G - limits: - cpu: 2 - memory: 2G - - name: cnt-2 - ... 
- resources: - requests: - cpu: 100m - memory: 100M - vendor.com/xpu: 1 - limits: - cpu: 200m - memory: 200M - vendor.com/xpu: 1 + - name: cnt-1 + image: k8s.gcr.io/pause + resources: + requests: + cpu: 1 + memory: 1G + example.com/resource: 1 + limits: + cpu: 2 + memory: 2G + example.com/resource: 1 + volumeMounts: + - mountPath: /my-volume + name: my-volume + volumes: + - name: my-volume + emptyDir: ``` Then kubelet will send the following RunPodSandboxRequest when creating the Pod @@ -475,24 +476,33 @@ RunPodSandboxRequest: ... podResources: containers: - - name: cnt-1 + - name: cnt-1 + kubernetes_resources: requests: - cpu: 1 + cpu: "1" memory: 1G + example.com/resource: "1" limits: - cpu: 2 - memory: 2G - - name: cnt-2 - requests: - cpu: 100m - memory: 100M - vendor.com/xpu: 1 - limits: - cpu: 200m - memory: 200M - vendor.com/xpu: 1 + cpu: "2" + memory: 2G + example.com/resource: "1" + CDI_devices: + - name: example.com/resource=CDI-Dev-1 + mounts: + - container_path: /my-volume + host_path: /var/lib/kubelet/pods//volumes/kubernetes.io~empty-dir/my-volume + - container_path: /var/run/secrets/kubernetes.io/serviceaccount + host_path: /var/lib/kubelet/pods//volumes/kubernetes.io~projected/kube-api-access-4srqm + readonly: true + - container_path: /dev/termination-log + host_path: /var/lib/kubelet/pods//containers/cnt-1/ ``` +Note that all device plugin resources are passed down in the +`kubernetes_resources` field but this does not contain any properties of the +device that was actually allocated for the container. However, these properties +are exposed through the `CDI_devices`, `mounts` and `devices` fields. + ### Test Plan -[ ] I/we understand the owners of the involved components may require updates to +[x] I/we understand the owners of the involved components may require updates to existing tests to make this code solid enough prior to committing the changes necessary to implement this enhancement. 
@@ -527,6 +540,8 @@ Based on reviewers feedback describe what additional tests need to be added prio implementing this enhancement to ensure the enhancements have also solid foundations. --> +No prerequisite testing updates have been identified. + ##### Unit tests -- ``: `` - `` +- `k8s.io/kubernetes/pkg/kubelet/kuberuntime`: `2024-02-02` - `68.3%` + +The +[fake_runtime](https://github.com/kubernetes/cri-api/blob/master/pkg/apis/testing/fake_runtime_service.go) +will be used in unit tests to verify that the Kubelet correctly passes down the +resource information to the CRI runtime. ##### Integration tests @@ -560,7 +580,7 @@ For Beta and GA, add links to added tests together with links to k8s-triage for https://storage.googleapis.com/k8s-triage/index.html --> -- : +For alpha, no new integration tests are planned. ##### e2e tests @@ -574,7 +594,7 @@ https://storage.googleapis.com/k8s-triage/index.html We expect no non-infra related flakes in the last month as a GA graduation criteria. --> -- : +For alpha, no new e2e tests are planned. ### Graduation Criteria @@ -640,6 +660,25 @@ in back-to-back releases. - Deprecate the flag --> +#### Alpha + +- Feature implemented behind a feature flag +- Initial unit tests completed and enabled + +#### Beta + +- Gather feedback from developers and surveys +- Feature gate enabled by default +- containerd and CRI-O runtimes have released versions that have adopted the + new CRI API changes +- The [NRI API](https://github.com/containerd/nri) has adopted the feature + +#### GA + +- No bugs reported in the previous cycle +- N examples of real-world usage +- N installs + ### Upgrade / Downgrade Strategy +The feature gate (in kubelet) controls the feature enablement. Existing runtime +implementations will continue to work as previously, even if the feature is +enabled. + ### Version Skew Strategy +The feature is node-local (kubelet-only) so there are no dependencies or effects +to other Kubernetes components. 
+ +The behavior is unchanged if either kubelet or the CRI runtime running on a +node does not support the feature. If kubelet has the feature enabled but the +CRI runtime does not support it, the CRI runtime will ignore the new fields in +the CRI API and function as previously. Similarly, if the CRI runtime supports +the feature but the kubelet does not, the runtime will resort to the previous +behavior. + ## Production Readiness Review Questionnaire -- [ ] Feature gate (also fill in values in `kep.yaml`) - - Feature gate name: +- [X] Feature gate + - Feature gate name: KubeletContainerResourcesInPodSandbox - Components depending on the feature gate: -- [ ] Other - - Describe the mechanism: - - Will enabling / disabling the feature require downtime of the control - plane? - - Will enabling / disabling the feature require downtime or reprovisioning - of a node? + - kubelet ###### Does enabling the feature change any default behavior? diff --git a/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml index 58a30cf6498..2a5055a488d 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml +++ b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml @@ -8,9 +8,9 @@ participating-sigs: [] status: provisional creation-date: 2023-06-28 reviewers: - - TBD + - "@mikebrow" approvers: - - TBD + - "@haircommander" see-also: [] replaces: [] @@ -30,7 +30,7 @@ milestone: # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled feature-gates: - - name: TBD + - name: KubeletContainerResourcesInPodSandbox components: - kubelet disable-supported: true From 306f2906a15a08c5699907d320f83cedf3d43c1e Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 8 Oct 2024 15:59:21 +0300 Subject: [PATCH 14/26] KEP-4112: address review comments - add image volume example (mikebrow) - fix typos - update PRR --- .../4112-passdown-resources-to-cri/README.md 
| 106 +++++++++++++++--- 1 file changed, 92 insertions(+), 14 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 48e0b3e249c..b433cc31132 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -302,7 +302,7 @@ How will UX be reviewed, and by whom? Consider including folks who also work outside the SIG or subproject. --> -The proposal only adds new informantional data to the CRI API between kubelet +The proposal only adds new informational data to the CRI API between kubelet and the container runtime with no user-visible changes which mitigates possible risks considerably. @@ -495,9 +495,14 @@ spec: volumeMounts: - mountPath: /my-volume name: my-volume + - mountPath: /image-volume + name: image-volume volumes: - name: my-volume emptyDir: + - name: image-volume + image: + reference: example.com/registry/artifact:tag ``` Then kubelet will send the following RunPodSandboxRequest when creating the Pod @@ -524,6 +529,10 @@ RunPodSandboxRequest: mounts: - container_path: /my-volume host_path: /var/lib/kubelet/pods//volumes/kubernetes.io~empty-dir/my-volume + - container_path: /image-volume + image: + image: example.com/registry/artifact:tag + ... - container_path: /var/run/secrets/kubernetes.io/serviceaccount host_path: /var/lib/kubelet/pods//volumes/kubernetes.io~projected/kube-api-access-4srqm readonly: true @@ -798,6 +807,12 @@ Any change of default behavior may be surprising to users or break existing automations, so be extremely careful here. --> +The default behavior in Kubernetes is unchanged. + +However, there might be changes in behavior if the underlying CRI runtime +depends on this feature. For example, an NRI plugin relying on the feature may +cause the application to behave differently. + ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? 
+Yes, disabling the `KubeletContainerResourcesInPodSandbox` feature gate will +disable the feature. Restarting pods may be needed to reset the information +that was passed down to the CRI. + ###### What happens if we reenable the feature if it was previously rolled back? +New pods will have the feature enabled. Existing pods will continue to operate +as before until restarted. + ###### Are there any tests for feature enablement/disablement? +Unit tests for the feature gate will be added. + ### Rollout, Upgrade and Rollback Planning +Rollback or rollout in the kubelet should not fail - it only enables/disables +the information (fields in the CRI message) passed down to the CRI runtime. + +However, if the CRI runtime depends on the feature, a rollout or rollback may +cause failures of applications on pod restarts. Running pods are not affected. + ###### What specific metrics should inform a rollback? +Alpha: No new metrics are planned. Non-ready pods with CreatePodSandboxError +status is one indicator. The error message will contain details if the CRI +failure is related to the feature. + ###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? +Alpha: Manual testing of the feature gate is performed. + ###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? +No. + ### Monitoring Requirements +By examing the kubelet configuration (feature gate) and the version of the +kubelet and the CRI runtime. + ###### How can someone using this feature know that it is working for their instance? -- [ ] Events - - Event Reason: -- [ ] API .status - - Condition name: - - Other field: -- [ ] Other (treat as last resort) - - Details: +The end users do not see the status of the feature directly. + +The cluster operator can verify that the feature is working by examining the +kubelet and CRI runtime logs. 
+ +The CRI runtime or NRI plugin developers depending on the feature can ensure +that it is working by verifying that all the required information is available +at pod sandbox creation time. ###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? @@ -920,18 +962,15 @@ These goals will help you determine what you need to measure (SLIs) in the next question. --> +N/A. + ###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? -- [ ] Metrics - - Metric name: - - [Optional] Aggregation method: - - Components exposing the metric: -- [ ] Other (treat as last resort) - - Details: +N/A. ###### Are there any missing metrics that would be useful to have to improve observability of this feature? @@ -940,6 +979,8 @@ Describe the metrics themselves and the reasons why they weren't added (e.g., co implementation difficulties, etc.). --> +N/A. + ### Dependencies +No. + +However, the practical usability of this feature requires that also the CRI +runtime supports it. The feature is effectively a no-op if the CRI runtime does +not support it. + ### Scalability +No. + ###### Will enabling / using this feature result in introducing new API types? +No. + ###### Will enabling / using this feature result in any new calls to the cloud provider? +No. + ###### Will enabling / using this feature result in increasing size or count of the existing API objects? +No. + ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? +Not noticeably. + ###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? +No. The new data fields in the CRI API would not count as significant increase. + ###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? +No. 
+ ### Troubleshooting +The feature in Kubernetes is relatively straightforward - passing extra +information to the CRI runtime. The failure scenarios arise in the CRI runtime +level, e.g.: + +- misbehaving CRI runtime or NRI plugin +- CRI runtime or NRI plugin is depending on the feature but it is not enabled + in the kubelet +- configuration skew in the cluster where some nodes have the feature enabled + and some do not + +Pod events and CRI runtime logs are the primary sources of information for +these failure scenarios. + ###### What steps should be taken if SLOs are not being met to determine the problem? +N/A. + ## Implementation History + + + +[kep-1287]: https://github.com/kubernetes/enhancements/issues/1287 +[kep-1287-beta-pr]: https://github.com/kubernetes/enhancements/pull/4704 +[kep-2837]: https://github.com/kubernetes/enhancements/issues/2837 +[kep-2837-alpha-pr]: https://github.com/kubernetes/enhancements/pull/4678 From c4efee477af498e3afb427a0592877ef294c3e71 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 8 Oct 2024 20:46:30 +0300 Subject: [PATCH 16/26] KEP-4112: update toc --- keps/sig-node/4112-passdown-resources-to-cri/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 8b0e017bf47..36bed368228 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -78,6 +78,7 @@ SIG Architecture for cross-cutting KEPs). 
- [PodSandboxConfig](#podsandboxconfig) - [CreateContainer](#createcontainer) - [UpdateContainerResourcesRequest](#updatecontainerresourcesrequest) + - [UpdatePodSandboxResources](#updatepodsandboxresources) - [kubelet](#kubelet) - [Test Plan](#test-plan) - [Prerequisite testing updates](#prerequisite-testing-updates) From d3822099bcafe522073d834eb6172e76233a8b63 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 8 Oct 2024 21:08:22 +0300 Subject: [PATCH 17/26] KEP-4112: drop NRI API from the beta criteria --- keps/sig-node/4112-passdown-resources-to-cri/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 36bed368228..bcd486ce45f 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -764,7 +764,6 @@ in back-to-back releases. - Feature gate enabled by default - containerd and CRI-O runtimes have released versions that have adopted the new CRI API changes -- The [NRI API](https://github.com/containerd/nri) has adopted the feature #### GA From 2bd0c741b67f0b911c168a95cd3fbf0d453296b1 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 8 Oct 2024 21:37:55 +0300 Subject: [PATCH 18/26] KEP-4112: added note about possible metadata-skew --- keps/sig-node/4112-passdown-resources-to-cri/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index bcd486ce45f..82b9a66e3c9 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -872,6 +872,15 @@ However, there might be changes in behavior if the underlying CRI runtime depends on this feature. For example, an NRI plugin relying on the feature may cause the application to behave differently. 
+Long running pods that persist (without restart) over kubelet and CRI runtime +update which enables the feature may experience version skew of the metadata. +After enabling the feature, the CRI runtime does not have the aggregated +information of all resources of the pod, provided with this feature, as the +kubelet didn't restart these pods (didn't send the CreatePodSandbox CRI +request). This may affect some scenarios e.g. NRI plugins. This "metadata skew" +can be avoided by draining the node before updating the kubelet and the CRI +runtime. + ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? -Alpha: No new metrics are planned. Non-ready pods with CreatePodSandboxError -status is one indicator. The error message will contain details if the CRI -failure is related to the feature. +Alpha: No new metrics are planned. Increase in the existing +`kubelet_started_pods_errors_total` metric can indicate a problem caused by +this feature. + +Generally, non-ready pods with CreatePodSandboxError status (reflected by the +`kubelet_started_pods_errors_total` metric) is a possible indicator. The error +message will contain details if the CRI failure is related to the feature. ###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? @@ -990,8 +994,9 @@ checking if there are objects with field X set) may be a last resort. Avoid logs or events for this purpose. --> -By examing the kubelet configuration (feature gate) and the version of the -kubelet and the CRI runtime. +By examining the kubelet feature gate and the version of the CRI runtime. The +enablement of the kubelet feature gate can be determined from the +`kubernetes_feature_enabled` metric. ###### How can someone using this feature know that it is working for their instance? @@ -1030,15 +1035,18 @@ These goals will help you determine what you need to measure (SLIs) in the next question. --> -N/A. 
+No increase in the `kubelet_started_pods_errors_total` rate. ###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? - +- [x] Metrics + - Metric name: `kubelet_started_pods_errors_total` + - Components exposing the metric: kubelet -N/A. +> NOTE: The `kubelet_started_pods_errors_total` metric is a general metric for +> any errors that occur when starting pods. The error message (Pod events, +> kubelet logs) will contain details if the CRI failure is related to the +> feature. ###### Are there any missing metrics that would be useful to have to improve observability of this feature? diff --git a/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml index 2a5055a488d..390557fb1c2 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml +++ b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml @@ -5,7 +5,7 @@ authors: - "@askervin" owning-sig: sig-node participating-sigs: [] -status: provisional +status: implementable creation-date: 2023-06-28 reviewers: - "@mikebrow" From e1d785a2e4132fb9658d1e6134bcbc75a2032bd2 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Wed, 9 Oct 2024 09:14:28 +0300 Subject: [PATCH 20/26] KEP-4112: update PRR --- keps/sig-node/4112-passdown-resources-to-cri/README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 08ceacadc49..633900bb7e5 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -866,11 +866,9 @@ Any change of default behavior may be surprising to users or break existing automations, so be extremely careful here. --> -The default behavior in Kubernetes is unchanged. - -However, there might be changes in behavior if the underlying CRI runtime -depends on this feature. 
For example, an NRI plugin relying on the feature may -cause the application to behave differently. +There might be changes in behavior if the underlying CRI runtime depends on +this feature. For example, an NRI plugin relying on the feature may cause the +application to behave differently. Long running pods that persist (without restart) over kubelet and CRI runtime update which enables the feature may experience version skew of the metadata. From f5009aa3041434d649a10e1d1c6f78ba4cfaf744 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Wed, 9 Oct 2024 17:51:36 +0300 Subject: [PATCH 21/26] KEP-4112: update Add justification for updating including Kubernetes resources (requests/limits) in CreateContainer message. --- .../4112-passdown-resources-to-cri/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 633900bb7e5..441cc569050 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -330,6 +330,11 @@ can see the full picture of Pod's resource usage at Pod creation time, for example enabling more holistic resource allocation and thus better interoperability between containers inside the Pod. +Also the CreateContainer request is extended to include the unmodified resource +requirements. This make it possible for the CRI runtime to detect any changes +in the pod resources that happen between the Pod creation and container +creation in e.g. scenarios where in-place pod updates are involved. + The UpdatePodSandboxResources CRI message is also updated when/if that is introduced by the [KEP-1287][kep-1287] Beta ([PR][kep-1287-beta-pr]). @@ -452,6 +457,17 @@ contain unmodified resource requests from the PodSpec. Note that mounts, devices, CDI devices are part of the ContainerConfig message but are left out of the diff snippet above. 
+Including the KubernetesResources in the ContainerConfig message serves +multiple purposes: + +1. Catch changes that happen between pod sandbox creation and container + creation. For example, in-place pod updates might change the container + before it was created. +2. Catch changes that happen over container restarts in in-place pod update + scenarios +3. Consistency/completeness. Have enough information to make consistent action + based only on information present in this rpc call. + The resources (mounts, devices, CDI devices, Kubernetes resources) in the CreateContainer request should be identical to what was (pre-)informed in the RunPodSandbox request. If they are different, the CRI runtime may fail the From dfb1bc60c68e92ce82eb7b8e983237a525651605 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Wed, 9 Oct 2024 18:00:00 +0300 Subject: [PATCH 22/26] KEP-4112: update PRR questionnaire --- keps/sig-node/4112-passdown-resources-to-cri/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 441cc569050..47d0e61f065 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -882,9 +882,10 @@ Any change of default behavior may be surprising to users or break existing automations, so be extremely careful here. --> -There might be changes in behavior if the underlying CRI runtime depends on -this feature. For example, an NRI plugin relying on the feature may cause the -application to behave differently. +Yes. The kubelet will start passing the extra information to the CRI runtime +for every container it creates. Whether this has any effect depends on if the +underlying CRI runtime supports this feature. For example, an NRI plugin +relying on the feature may cause the application to behave differently. 
Long running pods that persist (without restart) over kubelet and CRI runtime update which enables the feature may experience version skew of the metadata. From 5f066a00a7c7e79ccd08e4168d9f80bf5d736565 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Wed, 9 Oct 2024 21:50:52 +0300 Subject: [PATCH 23/26] KEP-4112: address review feedback - rename enum REGULAR_CONTAINER -> CONTAINER - update references to other related keps under Design Details --- .../4112-passdown-resources-to-cri/README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 47d0e61f065..6b64cacc8e6 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -335,12 +335,16 @@ requirements. This make it possible for the CRI runtime to detect any changes in the pod resources that happen between the Pod creation and container creation in e.g. scenarios where in-place pod updates are involved. -The UpdatePodSandboxResources CRI message is also updated when/if that is -introduced by the [KEP-1287][kep-1287] Beta ([PR][kep-1287-beta-pr]). +[KEP-1287][kep-1287] Beta ([PR][kep-1287-beta-pr]) proposes to add new +UpdatePodSandboxResources rpc to the CRI API. If/when KEP-1287 is implemented +as proposed, the UpdatePodSandboxResources CRI message is updated to include +the resource information of all containers (aligning with +UpdateContainerResourcesRequest). -Information about the Pod-level resources are added when/if the Pod-level -resources enhancement [KEP-2837][kep-2837] Alpha ([PR][kep-2837-alpha-pr]) is -implemented. +[KEP-2837][kep-2837] Alpha ([PR][kep-2837-alpha-pr]) proposes to add new +Pod-level resource requirements field to the PodSpec. 
This information will be +be added to the PodResourceConfig message, similar to the container resource +information, if/when KEP-2837 is implemented as proposed. ### CRI API @@ -402,7 +406,7 @@ request. +enum ContainerType { + INIT_CONTAINER = 0; + SIDECAR_CONTAINER = 1; -+ REGULAR_CONTAINER = 2; ++ CONTAINER = 2; +} ``` From ea204299c71c4571ff0f38fa51e589e9104ccb16 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Thu, 10 Oct 2024 00:12:50 +0300 Subject: [PATCH 24/26] KEP-4112: update --- keps/sig-node/4112-passdown-resources-to-cri/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 6b64cacc8e6..c526f77f003 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -278,9 +278,9 @@ what the user requested to fairly allocate resources between applications. As a cluster administrator, I want to install an NRI plugin that does customized resource handling. I run kubelet with CPU manager and memory manager -disabled. Instead I use my NRI plugin to do customized resource allocation -(e.g. cpu and memory pinning). To do that properly I need the actual resource -requests and limits requested by the user. +disabled (CPU manager policy set to `none`). Instead I use my NRI plugin to do +customized resource allocation (e.g. cpu and memory pinning). To do that +properly I need the actual resource requests and limits requested by the user. ### Notes/Constraints/Caveats (Optional) @@ -323,7 +323,8 @@ of the CRI API. With this information, the runtime can for example do detailed resource allocation so that CPU, memory and other resources for each container are -optimally aligned. +optimally aligned. This applies to scenarios where the kubelet CPU manager is +disabled (by using the `none` CPU manager policy). 
The resource information is included in PodSandboxConfig so that the runtime can see the full picture of Pod's resource usage at Pod creation time, for From dc1d8f92e0f660ba4f5c27234883eedc9fd9a2da Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Thu, 10 Oct 2024 00:41:16 +0300 Subject: [PATCH 25/26] KEP-4112: drop other keps from non-goals --- keps/sig-node/4112-passdown-resources-to-cri/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index c526f77f003..3ccfc3ee164 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -253,8 +253,6 @@ breaking any existing use cases. - change kubelet resource management - change existing behavior of CRI -- add UpdatePodSandboxResources CRI rpc (this is covered by [KEP-1287][kep-1287], [PR][kep-1287-beta-pr]) -- add pod-level resource requirements (this is covered by [KEP-2837][kep-2837], [PR][kep-2837-alpha-pr]) ## Proposal From b8818557d56b9e7aaecbf910c02a4fd2dc242335 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 3 Dec 2024 10:46:35 +0200 Subject: [PATCH 26/26] KEP-4112: update for v1.33 - update kep.yaml to target v1.33 - update references to kep-1287 and kep-2837 --- .../4112-passdown-resources-to-cri/README.md | 60 +++++++++---------- .../4112-passdown-resources-to-cri/kep.yaml | 4 +- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/keps/sig-node/4112-passdown-resources-to-cri/README.md b/keps/sig-node/4112-passdown-resources-to-cri/README.md index 3ccfc3ee164..502cc991d64 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/README.md +++ b/keps/sig-node/4112-passdown-resources-to-cri/README.md @@ -334,16 +334,15 @@ requirements. This make it possible for the CRI runtime to detect any changes in the pod resources that happen between the Pod creation and container creation in e.g. 
scenarios where in-place pod updates are involved. -[KEP-1287][kep-1287] Beta ([PR][kep-1287-beta-pr]) proposes to add new -UpdatePodSandboxResources rpc to the CRI API. If/when KEP-1287 is implemented -as proposed, the UpdatePodSandboxResources CRI message is updated to include -the resource information of all containers (aligning with -UpdateContainerResourcesRequest). +[KEP-1287][kep-1287] ([Issue][kep-1287-issue]) Beta in Kubernetes v1.32 +introduced UpdatePodSandboxResources rpc to the CRI API. The +UpdatePodSandboxResources CRI message is updated to include the resource +information of all containers (aligning with UpdateContainerResourcesRequest). -[KEP-2837][kep-2837] Alpha ([PR][kep-2837-alpha-pr]) proposes to add new -Pod-level resource requirements field to the PodSpec. This information will be -be added to the PodResourceConfig message, similar to the container resource -information, if/when KEP-2837 is implemented as proposed. +[KEP-2837][kep-2837] ([Issue][kep-2837-issue]) Alpha in Kubernetes v1.32 +introduced Pod-level resource requirements field to the PodSpec. The +PodResourceConfig message in the CRI API is updated to include the Pod-level +resource requirements. ### CRI API @@ -360,6 +359,12 @@ the resource information presented here and allocate resources for each container separately at container creation time with the `CreateContainer` request. +The Pod-level resources enhancement [KEP-2837][kep-2837] +([Issue][kep-2837-issue]) Alpha in Kubernetes v1.32 added new Pod-level +resource requirements field to the PodSpec. This information is included +in the PodResourceConfig message, similar to the container-level resource +information. + ```diff message PodSandboxConfig { @@ -377,7 +382,13 @@ request. +// PodResourceConfig contains information of all resources requirements of +// the containers of a pod. +message PodResourceConfig { ++ // Resource configuration of all containers in the pod.
+ repeated ContainerResourceConfig containers = 1; ++ ++ // Kubernetes resource spec of the pod-level resource requirements. ++ // This is the pod-level resource requirements introduced in KEP-2837 ++ // (alpha in v1.32). ++ KubernetesResources kubernetes_resources = 2; +} +// ContainerResourceConfig contains information of all resource requirements of @@ -409,23 +420,6 @@ request. +} ``` -The Pod-level resources enhancement [KEP-2837][kep-2837] -([alpha PR][kep-2837-alpha-pr]) proposes to add new Pod-level resource -requirements fields to the PodSpec. This information will be be added to the -PodResourceConfig message, similar to the container resource information. - -```diff - message PodResourceConfig { - repeated ContainerResourceConfig containers = 1; -+ -+ // Kubernetes resource spec of the pod-level resource requirements. -+ KubernetesResources kubernetes_resources = 2; - } -``` - -The implementation if adding the KubernetesResources field to the -PodResourceConfig is synced with [KEP-2837][kep-2837]. - #### CreateContainer The ContainerConfig message (used in CreateContainer request) is extended to @@ -506,9 +500,9 @@ adding them. #### UpdatePodSandboxResources -The In-Place Update of Pod Resources ([KEP-1287][kep-1287]) Beta -([PR][kep-1287-beta-pr]) proposes to add new UpdatePodSandboxResources rpc to -inform the CRI runtime about the changes in the pod resources. +The In-Place Update of Pod Resources ([KEP-1287][kep-1287]) Beta in Kubernetes +v1.32 introduced new UpdatePodSandboxResources rpc to inform the CRI runtime +about the changes in the pod resources. The UpdatePodSandboxResourcesRequest message is extended similarly to the [PodSandboxConfig](#podsandboxconfig) message to contain information about @@ -1297,7 +1291,7 @@ SIG to get the process for these resources started right away. 
-[kep-1287]: https://github.com/kubernetes/enhancements/issues/1287 -[kep-1287-beta-pr]: https://github.com/kubernetes/enhancements/pull/4704 -[kep-2837]: https://github.com/kubernetes/enhancements/issues/2837 -[kep-2837-alpha-pr]: https://github.com/kubernetes/enhancements/pull/4678 +[kep-1287]: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/1287-in-place-update-pod-resources +[kep-1287-issue]: https://github.com/kubernetes/enhancements/issues/1287 +[kep-2837]: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2837-pod-level-resource-spec +[kep-2837-issue]: https://github.com/kubernetes/enhancements/issues/2837 diff --git a/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml index 390557fb1c2..8f04f0eb00d 100644 --- a/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml +++ b/keps/sig-node/4112-passdown-resources-to-cri/kep.yaml @@ -21,11 +21,11 @@ stage: alpha # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.32" +latest-milestone: "v1.33" # The milestone at which this feature was, or is targeted to be, at each stage. milestone: - alpha: "v1.32" + alpha: "v1.33" # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled