diff --git a/hack/verify-toc.sh b/hack/verify-toc.sh index e4f84abc54f8..0d06e18690d7 100755 --- a/hack/verify-toc.sh +++ b/hack/verify-toc.sh @@ -47,6 +47,7 @@ echo "Checking table of contents are up to date..." find keps -name '*.md' \ | grep -Fxvf hack/.notableofcontents \ | xargs mdtoc --inplace --max-depth=5 --dryrun || ( - echo "Table of content not up to date. If this failed silently and you are on mac, try 'brew install grep'" + echo "Table of content not up to date. Did you run 'make update-toc' ?" + echo "If this failed silently and you are on mac, try 'brew install grep'" exit 1 ) diff --git a/keps/NNNN-kep-template/README.md b/keps/NNNN-kep-template/README.md index 2d6e477767b7..18d6c1461139 100644 --- a/keps/NNNN-kep-template/README.md +++ b/keps/NNNN-kep-template/README.md @@ -720,6 +720,18 @@ This through this both in small and large cases, again with respect to the [supported limits]: https://git.k8s.io/community//sig-scalability/configs-and-limits/thresholds.md --> +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? + + + ### Troubleshooting + +### Version Skew Strategy + +This feature affects only the kube-apiserver, so there is no issue +with version skew with other components. ## Production Readiness Review Questionnaire ### Feature Enablement and Rollback -* **How can this feature be enabled / disabled in a live cluster?** To enable - priority and fairness, all of the following must be enabled: - - [x] Feature gate - - Feature gate name: APIPriorityAndFairness - - Components depending on the feature gate: - - kube-apiserver - - [x] Command-line flags - - `--enable-priority-and-fairness`, and - - `--runtime-config=flowcontrol.apiserver.k8s.io/v1alpha1=true` +###### How can this feature be enabled / disabled in a live cluster? + +- [x] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: APIPriorityAndFairness + - Components depending on the feature gate: + - kube-apiserver +- [x] Other + - Describe the mechanism: command-line flag + `enable-priority-and-fairness`, which defaults to `true`. + - Will enabling / disabling the feature require downtime of the + control plane? Changing a command-line flag requires restarting + the kube-apiserver. + - Will enabling / disabling the feature require downtime or + reprovisioning of a node? (Do not assume `Dynamic Kubelet Config` + feature is enabled). No. -* **Does enabling the feature change any default behavior?** Yes, requests that - weren't rejected before could get rejected while requests that were rejected - previously may be allowed. Performance of kube-apiserver under heavy load - will likely be different too. +###### Does enabling the feature change any default behavior? -* **Can the feature be disabled once it has been enabled (i.e. can we roll back - the enablement)?** Yes. +Yes, requests that weren't rejected before could get rejected while +requests that were rejected previously may be allowed. Performance of +kube-apiserver under heavy load will likely be different too. -* **What happens if we reenable the feature if it was previously rolled back?** - The feature will be restored. +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? -* **Are there any tests for feature enablement/disablement?** No. Manual tests - will be run before switching feature gate to beta. +Yes. + +###### What happens if we reenable the feature if it was previously rolled back? 
+ +The feature will be restored. + +###### Are there any tests for feature enablement/disablement? + +No. Manual tests were run before switching feature gate to beta. ### Rollout, Upgrade and Rollback Planning -* **How can a rollout fail? Can it impact already running workloads?** A - misconfiguration could cause apiserver requests to be rejected, which could - have widespread impact such as: (1) rejecting controller requests, thereby - bringing a lot of things to a halt, (2) dropping node heartbeats, which may - result in overloading other nodes, (3) rejecting kube-proxy requests to - apiserver, thereby breaking existing workloads, (4) dropping leader election - requests, resulting in HA failure, or any combination of the above. - -* **What specific metrics should inform a rollback?** An abnormal spike in the - `apiserver_flowcontrol_rejected_requests_total` metric should potentially be - viewed as a sign that kube-apiserver is rejecting requests, potentially - incorrectly. The `apiserver_flowcontrol_request_queue_length_after_enqueue` - metric getting too close to the configured queue length could be a sign of - insufficient queue size (or a system overload), which can be precursor to - rejected requests. - -* **Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?** - No. Manual tests will be run before switching feature gate to beta. - -* **Is the rollout accompanied by any deprecations and/or removals of features, APIs, - fields of API types, flags, etc.?** Yes, `--max-requests-inflights` will be - deprecated in favor of APF. +###### How can a rollout or rollback fail? Can it impact already running workloads? -### Monitoring Requirements +A misconfiguration could cause apiserver requests to be rejected, +which could have widespread impact such as: (1) rejecting controller +requests, thereby bringing a lot of things to a halt, (2) dropping +node heartbeats, which may result in overloading other nodes, (3) +rejecting kube-proxy requests to apiserver, thereby breaking existing +workloads, (4) dropping leader election requests, resulting in HA +failure, or any combination of the above. -* **How can an operator determine if the feature is in use by workloads?** - If the `apiserver_flowcontrol_dispatched_requests_total` metric is non-zero, - this feature is in use. Note that this isn't a workload feature, but a - control plane one. +###### What specific metrics should inform a rollback? -* **What are the SLIs (Service Level Indicators) an operator can use to determine -the health of the service?** - - [x] Metrics - - Metric name: `apiserver_flowcontrol_request_queue_length_after_enqueue` - - Components exposing the metric: kube-apiserver +See [the remarks on service health](#what-are-the-slis-service-level-indicators-an-operator-can-use-to-determine-the-health-of-the-service). -* **What are the reasonable SLOs (Service Level Objectives) for the above SLIs?** - No SLOs are proposed for the above SLI. +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + +Manual tests were run during promotion to Beta in 1.20. + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + +The rollout replaces the behavior of the max-in-flight filter with the +more sophisticated behavior of this feature. + +### Monitoring Requirements -* **Are there any missing metrics that would be useful to have to improve observability -of this feature?** No. 
+###### How can an operator determine if the feature is in use by workloads? + +If the `apiserver_flowcontrol_dispatched_requests_total` metric is +non-zero, this feature is in use. Note that this isn't a workload +feature, but a control plane one. + +###### How can someone using this feature know that it is working for their instance? + +The classification functionality can be reviewed by looking at the +`apf_pl` and `apf_fs` attributes that appear in the httplog lines from +the apiserver when `-v=3` or more verbose. The classification is also +reported in each HTTP response via the +`X-Kubernetes-PF-PriorityLevel-UID` and +`X-Kubernetes-PF-FlowSchema-UID` headers, respectively. + +The nominal division of concurrency can be checked by looking at the +`apiserver_flowcontrol_nominal_limit_seats` metric of the limited +priority levels to see whether they are in the proportions given by +the `NominalConcurrencyShares` fields in the corresponding +`PriorityLevelConfiguration` objects, and sum to the server's +concurrency limit (the sum of the `--max-requests-inflight` and +`--max-mutating-requests-inflight` command line parameters) allowing +for rounding error. + +The dynamic adjustment of concurrency limits can be checked for +validity by checking whether the +`apiserver_flowcontrol_current_limit_seats` metrics sum to the +server's concurrency limit allowing for rounding error. The +responsiveness to load can be checked by comparing those dynamic +concurrency limits to various indications of load and keeping in mind +the bounds shown by the `apiserver_flowcontrol_lower_limit_seats` and +`apiserver_flowcontrol_upper_limit_seats` metrics. Indications of +load include the `apiserver_flowcontrol_request_wait_duration_seconds` +histogram vector metric and the +`apiserver_flowcontrol_demand_seats_smoothed` gauge vector metric. + +The dispatching can be checked by looking at whether there are +significant queuing delays only for flow schemas that go to priority +levels whose seats are highly utilized. The histogram vector metric +`apiserver_flowcontrol_request_wait_duration_seconds` shows the +former, and the histogram vector metric +`apiserver_flowcontrol_priority_level_seat_utilization` shows the +latter. + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + +None have been identified. + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + +Since this feature is about protecting the server from overload, +trouble in this feature and actual overload can show similar symptoms. +Following are some thoughts about how to tell them apart. + +- high latency / long queue lengths + low request volume / low CPU + usage = probably APF misconfiguration or bug + +- high latency / long queue lengths + high request volume / high CPU + usage = probably APF is saving your cluster from an outage + +The following metrics give relevant clues. + +- `apiserver_flowcontrol_request_wait_duration_seconds` shows queuing + delay in APF. +- `apiserver_flowcontrol_request_execution_seconds` shows execution + durations in APF; that summed with + `apiserver_flowcontrol_request_wait_duration_seconds` shows total + handling time covered by APF. +- The `apiserver_request_duration_seconds_count` histogram shows total + request delay as measured in a handler that encloses APF, grouped by + request characteristics. 
+- The rate of change in + `apiserver_flowcontrol_request_execution_seconds_count` shows the + rates of request handling, grouped by flow schema. +- The rate of change in `apiserver_request_total` shows the rates at + which requests are being handled, grouped by request + characteristics. +- the `apiserver_flowcontrol_current_inqueue_requests` gauge vector + samples queue lengths for each flow schema. +- the `apiserver_flowcontrol_priority_level_seat_utilization` + histogram vector summarizes per-nanosecond samples of the fraction + each priority level's seats that are occupied (these should not all + be low if the server is heavily loaded). + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + +No. ### Dependencies -* **Does this feature depend on any specific services running in the cluster?** - No. +###### Does this feature depend on any specific services running in the cluster? + +No ### Scalability -* **Will enabling / using this feature result in any new API calls?** Yes. - Self-requests for new API objects will be introduced. In addition, the - request execution order may change, which could occasionally increase the - number of retries. +###### Will enabling / using this feature result in any new API calls? -* **Will enabling / using this feature result in introducing new API types?** - Yes, a new flowcontrol API group, configuration types, and status types are - introduced. See `k8s.io/api/flowcontrol/v1alpha1/types.go` for a full list. +Yes. Self-requests for new API objects will be introduced. In +addition, the request execution order may change, which could +occasionally increase the number of retries. -* **Will enabling / using this feature result in any new calls to the cloud - provider?** No. +###### Will enabling / using this feature result in introducing new API types? -* **Will enabling / using this feature result in increasing size or count of - the existing API objects?** No. +Yes, a new flowcontrol API group, configuration types, and status +types are introduced. See `k8s.io/api/flowcontrol/v1alpha1/types.go` +for a full list. -* **Will enabling / using this feature result in increasing time taken by any - operations covered by [existing SLIs/SLOs]?** Yes, a non-negligible latency - is added to API calls to kube-apiserver. While [preliminary tests](https://github.com/tkashem/graceful/blob/master/priority-fairness/filter-latency/readme.md) - shows that the API server latency is still well within the existing SLOs, - more thorough testing needs to be performed. +###### Will enabling / using this feature result in any new calls to the cloud provider? -* **Will enabling / using this feature result in non-negligible increase of - resource usage (CPU, RAM, disk, IO, ...) in any components?** The proposed - flowcontrol logic in request handling in kube-apiserver will increase the CPU - and memory overheads involved in serving each request. Note that the resource - usage will be configurable and may require the operator to fine-tune some - parameters. +No. -### Troubleshooting +###### Will enabling / using this feature result in increasing size or count of the existing API objects? -* **How does this feature react if the API server and/or etcd is unavailable?** - The feature is itself within the API server. Etcd being unavailable would - likely cause kube-apiserver to fail at processing incoming requests. +No. -* **What are other known failure modes?** A misconfiguration could reject - requests incorrectly. 
See the rollout and monitoring sections for details on - which metrics to watch to detect such failures (see the `kep.yaml` file for - the full list of metrics). The following kube-apiserver log messages could - also indicate potential issues: - - "Unable to list PriorityLevelConfiguration objects" - - "Unable to list FlowSchema objects" +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? -* **What steps should be taken if SLOs are not being met to determine the - problem?** No SLOs are proposed. +Yes, a non-negligible latency is added to API calls to +kube-apiserver. While [preliminary +tests](https://github.com/tkashem/graceful/blob/master/priority-fairness/filter-latency/readme.md) +show that the API server latency is still well within the existing +SLOs, more thorough testing needs to be performed. -[supported limits]: https://git.k8s.io/community//sig-scalability/configs-and-limits/thresholds.md -[existing SLIs/SLOs]: https://git.k8s.io/community/sig-scalability/slos/slos.md#kubernetes-slisslos +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? -## Implementation History +The proposed flowcontrol logic in request handling in kube-apiserver +will cause a small increase in the CPU and memory overheads involved in +serving each request when there is no significant queuing. When there +_is_ significant queuing, it is because this feature is doing its +intended function of smoothing out usage of many resources by limiting +the number of requests being actively handled at once. Holding +requests in queues involves an added memory cost. Note that the +resource usage will be configurable and may require the operator to +fine-tune some parameters. + +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? + +This feature does not directly consume any resources on worker nodes. +It can have indirect effects, through the changes in how the +kube-apiserver serves requests. +### Troubleshooting + +###### How does this feature react if the API server and/or etcd is unavailable? + +This feature is entirely in the apiserver, so will be unavailable when +the apiserver is unavailable. + +This feature does not depend directly on the storage and will continue +to perform its function while the storage is unavailable, except for +the parts of the function that involve maintaining its configuration +objects and reacting to changes in them. While the storage is +unavailable this feature continues to use the last configuration that +it read. Unavailability of storage will naturally have severe effects +on request executions and this feature will react to the relevant +consequences. + +###### What are other known failure modes? + + - Some client makes requests with big responses that the client does + not finish reading in the first minute, constantly occupying (hand + size) X (max width = 10) seats --- which nearly starves other + clients when the priority level's concurrency limit is not much + greater than that. + - Detection: How can it be detected via metrics? In this situation + the requests from the bad client will time out during execution + at a rate of (hand size)/minute, and other requests in the same + priority level will almost all time out while waiting in a queue. + - Mitigations: What can be done to stop the bleeding, especially + for already running user workloads? 
Identify the bad client and + create a special flow schema and priority level to isolate that + client. + - Diagnostics: What are the useful log messages and their required + logging levels that could help debug the issue? In the + kube-apiserver's log at `-v=3` or more verbose the httplog + entries will indicate timeouts. + - Testing: Are there any tests for failure mode? If not, describe + why. We only recently realized this failure mode and we hope to + make a change that will prevent it or automatically mitigate it. + See https://github.com/kubernetes/kubernetes/issues/115409 . + + - A misconfiguration could reject + requests incorrectly. + - Detection: See the rollout and monitoring sections for details + on which metrics to watch to detect such failures (see the + `kep.yaml` file for the full list of metrics). The following + kube-apiserver log messages could also indicate potential + issues: + - "Unable to list PriorityLevelConfiguration objects" + - "Unable to list FlowSchema objects" + +###### What steps should be taken if SLOs are not being met to determine the problem? + +No SLOs are proposed. -- v1.19: `Alpha` release +## Implementation History + +- v1.18: `Alpha` release - v1.20: graduated to `Beta` - v1.22: initial support for width concept and watch initialization - v1.23: introduce `v1beta2` API - no changes compared to `v1beta1` - `v1beta1` remain as storage version - v1.24: storage version changed to `v1beta2` +- v1.26: introduce concurrency borrowing between priority levels ## Drawbacks @@ -2995,6 +3184,5 @@ designs not chosen. ## Infrastructure Needed -The end-to-end test suite should exercise the functionality introduced -by this KEP. This may require creating a special client to submit an -overload of low-priority work. +Nothing beyond the usual CI. + diff --git a/keps/sig-api-machinery/1040-priority-and-fairness/kep.yaml b/keps/sig-api-machinery/1040-priority-and-fairness/kep.yaml index befebd21bb18..55f16b8c3932 100644 --- a/keps/sig-api-machinery/1040-priority-and-fairness/kep.yaml +++ b/keps/sig-api-machinery/1040-priority-and-fairness/kep.yaml @@ -10,6 +10,7 @@ participating-sigs: - wg-multitenancy - sig-scheduling status: implementable +creation-date: 2019-02-28 reviewers: - "@deads2k" - "@lavalamp" @@ -19,21 +20,20 @@ reviewers: approvers: - "@deads2k" - "@lavalamp" -creation-date: 2019-02-28 # The target maturity stage in the current dev cycle for this KEP. -stage: beta +stage: stable # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.23" +latest-milestone: "v1.27" # The milestone at which this feature was, or is targeted to be, at each stage. milestone: alpha: "v1.18" beta: "v1.20" - stable: "v1.23" + stable: "v1.27" # The following PRR answers are required at alpha release. # List the feature gate name and the components for which it must be enabled. 
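The kep.yaml hunk that follows enumerates the flow-control metrics referenced in the monitoring answers above. As a rough, non-normative illustration of the spot checks described there (assuming `kubectl` access to the kube-apiserver's `/metrics` endpoint and the standard Prometheus text exposition format without timestamps):

```sh
# Sketch: sum the per-priority-level nominal seat limits; per the monitoring
# notes above, this should roughly equal --max-requests-inflight plus
# --max-mutating-requests-inflight, allowing for rounding error.
kubectl get --raw /metrics \
  | grep '^apiserver_flowcontrol_nominal_limit_seats' \
  | awk '{sum += $2} END {print "sum of nominal limit seats:", sum}'

# Sketch: confirm the feature is dispatching requests at all.
kubectl get --raw /metrics \
  | grep '^apiserver_flowcontrol_dispatched_requests_total' | head
```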
@@ -47,9 +47,33 @@ disable-supported: true metrics: - apiserver_flowcontrol_rejected_requests_total - apiserver_flowcontrol_dispatched_requests_total +- apiserver_flowcontrol_priority_level_seat_utilization +- apiserver_flowcontrol_priority_level_request_utilization +- apiserver_flowcontrol_read_vs_write_current_requests +- apiserver_flowcontrol_current_r +- apiserver_flowcontrol_dispatch_r +- apiserver_flowcontrol_latest_s +- apiserver_flowcontrol_next_s_bounds +- apiserver_flowcontrol_next_discounted_s_bounds - apiserver_flowcontrol_current_inqueue_requests - apiserver_flowcontrol_request_queue_length_after_enqueue - apiserver_flowcontrol_request_concurrency_limit - apiserver_flowcontrol_current_executing_requests +- apiserver_flowcontrol_request_concurrency_in_use - apiserver_flowcontrol_request_wait_duration_seconds - apiserver_flowcontrol_request_execution_seconds +- apiserver_flowcontrol_watch_count_samples +- apiserver_flowcontrol_epoch_advance_total +- apiserver_flowcontrol_work_estimated_seats +- apiserver_flowcontrol_request_dispatch_no_accommodation_total +- apiserver_flowcontrol_nominal_limit_seats +- apiserver_flowcontrol_lower_limit_seats +- apiserver_flowcontrol_upper_limit_seats +- apiserver_flowcontrol_demand_seats +- apiserver_flowcontrol_demand_seats_high_watermark +- apiserver_flowcontrol_demand_seats_average +- apiserver_flowcontrol_demand_seats_stdev +- apiserver_flowcontrol_demand_seats_smoothed +- apiserver_flowcontrol_target_seats +- apiserver_flowcontrol_seat_fair_frac +- apiserver_flowcontrol_current_limit_seats diff --git a/keps/sig-api-machinery/2885-server-side-unknown-field-validation/README.md b/keps/sig-api-machinery/2885-server-side-unknown-field-validation/README.md index 5c21990c662e..5d2fc7449907 100644 --- a/keps/sig-api-machinery/2885-server-side-unknown-field-validation/README.md +++ b/keps/sig-api-machinery/2885-server-side-unknown-field-validation/README.md @@ -105,6 +105,8 @@ tags, and then generate with `hack/update-toc.sh`. - [Graduation Criteria](#graduation-criteria) - [Alpha](#alpha) - [Beta](#beta) + - [GA](#ga) + - [Version Skew Strategy](#version-skew-strategy) - [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) - [Feature Enablement and Rollback](#feature-enablement-and-rollback) - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) @@ -313,11 +315,12 @@ client-side validation, albeit one that is error-prone and not officially supported). Long-term, we want to favor using out-of-tree solutions for client-side -validation, though this idea is still in its infancy. +validation. The [kubeconform](https://github.com/yannh/kubeconform) project is an example of an out-of-tree solution that does this. -The [kubeval](https://www.kubeval.com/) project is an example of an out-of-tree solution that does this, and -we will look into expanding its support of open API to v3, and investigate -whether it makes sense as a permanent solution to client-side validation. +SIG API Machinery's effort will go to producing clear and complete (and +improving over time) API spec documents instead of building client-side +validation. We strongly recommend that third-party integrators make use of such +documents. 
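As a hedged illustration of the recommendation above (not something this KEP prescribes), a third-party integrator can pull the cluster's published spec documents straight from the apiserver and feed them to whichever validator it prefers; the output filenames below are arbitrary and the per-group path is only an example of what the v3 index lists:

```sh
# Sketch: download the OpenAPI documents that out-of-tree validators such as
# kubeconform can consume.
kubectl get --raw /openapi/v2 > openapi-v2.json            # aggregated v2 document
kubectl get --raw /openapi/v3 > openapi-v3-index.json      # v3 discovery index of per-group paths
kubectl get --raw /openapi/v3/apis/apps/v1 > apps-v1.json  # one group-version's v3 document
```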
##### Aligning json and yaml errors @@ -615,6 +618,11 @@ It tests the cross product of all valid permutations along the dimensions of: With field validation on by default in beta, we will modify [test/e2e/kubectl/kubectl.go](https://github.com/kubernetes/kubernetes/blob/master/test/e2e/kubectl/kubectl.go) to ensure that kubectl defaults to using server side field validation and detects unknown/duplicate fields as expected. +[GA] +We will introduce field validation specific e2e/conformance tests to submit +requests directly against the API server for both built-in and custom resources +to test that duplicate and unknown fields are appropriately detected. + ### Graduation Criteria ### Version Skew Strategy -If applicable, how will the component handle version skew with other -components? What are the guarantees? Make sure this is in the test plan. +Following the graduation of server-side field validation to GA, there will be a +period of time when a newer client (which expects server-side field validation locked in GA) will send +requests to an older server that could have server-side field validation +disabled. -Consider the following in developing a version skew strategy for this -enhancement: -- Does this enhancement involve coordinating behavior in the control plane and - in the kubelet? How does an n-2 kubelet without this feature available behave - when this feature is used? -- Will any other components on the node change? For example, changes to CSI, - CRI or CNI may require updating that component before the kubelet. ---> +Until version skew makes it incompatible for a client to hit a server +with field validation disabled, kubectl must continue to check if the feature is +enabled server-side. As long as the feature can be disabled server-side, kubectl +must continue to have client-side validation. When the feature can't be disabled +server-side, kubectl should remove client-side validation entirely. + +Given that kubectl [supports](https://kubernetes.io/releases/version-skew-policy/#kubectl) one minor version skew (older or newer) of kube-apipserver, +this would mean that removing client-side validation in 1.28 would be +1-version-skew-compatible with GA in 1.27 (when server-side validation can no +longer be disabled). + +See section on [Out-of-Tree +Alternatives to Client Side +Validation](#out-of-tree-alternatives-to-client-side-validation) for +alternatives to client-side validation ## Production Readiness Review Questionnaire @@ -749,6 +761,8 @@ Pick one of these and delete the rest. - [x] Feature gate (also fill in values in `kep.yaml`) - Feature gate name: ServerSideFieldValidation - Components depending on the feature gate: kube-apiserver + - Note: feature gate locked to true upon graduation to GA (1.27) and removed + two releases later (1.29) - [x] Other - Describe the mechanism: query parameter - Will enabling / disabling the feature require downtime of the control @@ -851,8 +865,15 @@ Recall that end users cannot usually observe component logs or access metrics. ###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? -Users should expect to see an increase in request latency and memory usage -(~20-25% increase) for requests that desire server side schema validation. 
+We have benchmark tests for field validation +[here](https://github.com/kubernetes/kubernetes/blob/master/test/integration/apiserver/field_validation_test.go#L2936-L2996) and an analysis of those +benchmarks summarized +[here](https://github.com/kubernetes/kubernetes/pull/107848#issuecomment-1032216698). + +Based on the above, users should expect to see negligible latency increases (with the +exception of SMP requests that are ~5% slower), and a memory increase that is negligible +for JSON (and no change for protobuf), but around 10-20% more memory usage for +YAML data. Cluster operators can also use CPU usage monitoring to determine whether increased usage of server-side schema validation dramatically increases CPU usage after feature enablement. diff --git a/keps/sig-api-machinery/2885-server-side-unknown-field-validation/kep.yaml b/keps/sig-api-machinery/2885-server-side-unknown-field-validation/kep.yaml index 3e69f60e4bba..26528b7ff4cd 100644 --- a/keps/sig-api-machinery/2885-server-side-unknown-field-validation/kep.yaml +++ b/keps/sig-api-machinery/2885-server-side-unknown-field-validation/kep.yaml @@ -18,21 +18,22 @@ see-also: replaces: # The target maturity stage in the current dev cycle for this KEP. -stage: beta +stage: stable # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.25" +latest-milestone: "v1.27" # The milestone at which this feature was, or is targeted to be, at each stage. milestone: alpha: "v1.23" beta: "v1.25" - stable: "v1.26" + stable: "v1.27" # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled +# Note: feature-gate removed upon graduation to GA in 1.27 feature-gates: - name: ServerSideFieldValidation components: diff --git a/keps/sig-api-machinery/2896-openapi-v3/README.md b/keps/sig-api-machinery/2896-openapi-v3/README.md index 43b2a0a8cc62..48e7b257950d 100644 --- a/keps/sig-api-machinery/2896-openapi-v3/README.md +++ b/keps/sig-api-machinery/2896-openapi-v3/README.md @@ -63,6 +63,7 @@ Architecture for cross-cutting KEPs). --> - [Proposal](#proposal) - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) - [Future Work](#future-work) + - [OpenAPI V2 plan](#openapi-v2-plan) - [Risks and Mitigations](#risks-and-mitigations) - [Design Details](#design-details) - [Paths](#paths) @@ -73,10 +74,12 @@ Architecture for cross-cutting KEPs). --> - [OpenAPI](#openapi) - [Version Skew](#version-skew) - [OpenAPI V3 Proto](#openapi-v3-proto) + - [Clients](#clients) - [Test Plan](#test-plan) - [Graduation Criteria](#graduation-criteria) - [Alpha](#alpha) - [Beta](#beta) + - [GA](#ga) - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) - [Version Skew Strategy](#version-skew-strategy) - [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) @@ -209,6 +212,17 @@ endpoints. After OpenAPI v3 is implemented, we can start working on updating clients to consume OpenAPI v3. A separate KEP will be published for that. +#### OpenAPI V2 plan + +As OpenAPI V3 becomes more and more available and easily accessible, +clients should move to OpenAPI V3 and away from OpenAPI V2. This should +result in a steady decline in the number of requests sent to the +OpenAPI V2 endpoint. 
In the short term, we will make OpenAPI V2 +computations lazier, deferring the aggregation and marshaling +process to be on demand rather than at a set interval. This is not +directly part of OpenAPI V3's GA plan, but is something we will be +monitoring closely once OpenAPI V3 is GA. + ### Risks and Mitigations Aggregation for OpenAPI v2 already consumes a non-negligible amount of @@ -277,6 +291,12 @@ updates. If there is a race and the client passes in an outdated etag value, the server will send a 301 to redirect the client to the URL with the latest etag value. +To ensure that the client cache does not grow in an unbounded manner, +the client-go disk cache will be replaced with a compactable disk +cache which performs occasional cleanup of old cached files. Refer to +[PR](https://github.com/kubernetes/kubernetes/pull/109701) for more +details. + ### Controllers #### OpenAPI Builder @@ -383,6 +403,16 @@ This solves the immediate protobuf issue but adds complexity to the OpenAPI sche The final alternative is to upgrade to OpenAPI 3.1 where the new JSON Schema version it is based off of supports fields alongside a `$ref`. However, OpenAPI does not follow semvar and 3.1 is a major upgrade over 3.0 and introduces various backwards incompatible changes. Furthermore, library support is currently lacking (gnostic) and doesn't fully support OpenAPI 3.1. One important backwards incompatible change is the removal of the nullable field and replacing it by changing the type field from a single string to an array of strings. +### Clients + +Updating all OpenAPI V2 clients to use OpenAPI V3 is outside the scope +of this KEP and part of future work. However, some key clients will be +updated to use OpenAPI V3 as part of GA. One example is [OpenAPI V3 +for kubectl +explain](https://github.com/kubernetes/enhancements/blob/master/keps/sig-cli/3515-kubectl-explain-openapiv3/README.md). +Server Side Apply will also be updated to directly use the OpenAPI V3 +schema rather than OpenAPI V2. Finally, the query param verifier in +kubectl will be updated to use OpenAPI V3. ### Test Plan @@ -407,6 +437,8 @@ For alpha, unit tests will be included to ensure that v3 schemas are properly generated and published. A validator will also be used to ensure that the spec generated is valid OpenAPI v3. +A fuzzer and performance benchmark tests will also be included. + ### Graduation Criteria #### Alpha @@ -420,13 +452,20 @@ generated is valid OpenAPI v3. - Incorrect OpenAPI polymorphic types (IntOrString, Quantity) are updated to use `anyOf` in OpenAPI V3 - Definition names of native resources are updated to omit their package paths - Parameters are reused as components -- `kubectl explain` to support using the OpenAPI v3 Schema - Aggregated API servers are queried for their v2 endpoint and converted to publish v3 if they do not directly publish v3 - Heuristics are used for the OpenAPI v2 to v3 conversion to maximize correctness of published spec - Aggregation for OpenAPI v3 will serve as a proxy to downstream OpenAPI paths +#### GA + +- OpenAPI V3 uses the optimized JSON marshaler for increased performance. See [link](https://github.com/kubernetes/kube-openapi/pull/319) for the benefits with OpenAPI V2. 
+- [Updated OpenAPI V3 Client Interface](https://docs.google.com/document/d/1HmqJH-yyK8WyU8V1y5z-RyMLq9-Xe8JfTTuaNkm9GZ0) +- Direct conversion from OpenAPI V3 to SMD schema for Server Side Apply +- HTTP Disk Cache is replaced with Compactable Disk Cache +- Conformance tests + ### Upgrade / Downgrade Strategy -In order to lower memory consumption while getting a list of data and make it more predictable, we propose to use consistent streaming from the watch-cache instead of paging from etcd. +In order to lower memory consumption while getting a list of data and make it more predictable, we propose to use streaming from the watch-cache instead of paging from etcd. Initially, the proposed changes will be applied to informers as they are usually the heaviest users of LIST requests (see [Appendix](#appendix) section for more details on how informers operate today). The primary idea is to use standard WATCH request mechanics for getting a stream of individual objects, but to use it for LISTs. This would allow us to keep memory allocations constant. @@ -266,17 +271,17 @@ plus a few additional allocations, that will be explained later in this document The rough idea/plan is as follows: - step 1: change the informers to establish a WATCH request with a new query parameter instead of a LIST request. -- step 2: upon receiving the request from an informer, contact etcd to get the latest RV. It will be used to make sure the watch cache has seen objects up to the received RV. This step is necessary and ensures we will serve consistent data, even from the cache. -- step 2a: send all objects currently stored in memory for the given resource. +- step 2: upon receiving the request from an informer, compute the RV at which the result should be returned (possibly contacting etcd if consistent read was requested). It will be used to make sure the watch cache has seen objects up to the received RV. This step is necessary and ensures we will meet the consistency requirements of the request. +- step 2a: send all objects currently stored in memory for the given resource type. - step 2b: propagate any updates that might have happened meanwhile until the watch cache catches up to the latest RV received in step 2. - step 2c: send a bookmark event to the informer with the given RV. - step 3: listen for further events using the request from step 1. Note: the proposed watch-list semantics (without bookmark event and without the consistency guarantee) kube-apiserver follows already in RV="0" watches. The mode is not used in informers today but is supported by every kube-apiserver for legacy, compatibility reasons. -A watch started with RV="0" may return stale. It is possible for the watch to start at a much older resource version that the client has previously observed, particularly in high availability configurations, due to partitions or stale caches +A watch started with RV="0" may return stale data. It is possible for the watch to start at a much older resource version that the client has previously observed, particularly in high availability configurations, due to partitions or stale caches. -Note 2: informers need consistent lists to avoid time-travel when switching to another HA instance of kube-apiserver with outdated/lagging watch cache. +Note 2: informers need consistent lists to avoid time-travel when initializing after restart to avoid time travel in case of switching to another HA instance of kube-apiserver with outdated/lagging watch cache. 
See the following [issue](https://github.com/kubernetes/kubernetes/issues/59848) for more details. @@ -310,7 +315,7 @@ required) or even code snippets. If there's any ambiguity about HOW your proposal will be implemented, this is the place to discuss them. --> -### Required changes for a WATCH request with the RV="" and the ResourceVersionMatch=MostRecent +### Required changes for a WATCH request with the SendInitialEvents=true The following sequence diagram depicts steps that are needed to complete the proposed feature. A high-level overview of each was provided in a table that follows immediately the diagram. @@ -328,11 +333,11 @@ Whereas further down in this section we provided a detailed description of each 2. - The watch cache contacts etcd for the most up-to-date ResourceVersion. + If needed, the watch cache contacts etcd for the most up-to-date ResourceVersion. 2a. - The watch cache starts streaming initial data. The data it already has in memory. + The watch cache starts streaming initial data it already has in memory. 2b. @@ -352,14 +357,14 @@ Whereas further down in this section we provided a detailed description of each -Step 1: On initialization the reflector gets a snapshot of data from the server by passing RV=”” (= unset value) and setting resourceVersionMatch=MostRecent (= ensure freshness). +Step 1: On initialization the reflector gets a snapshot of data from the server by passing RV=”” (= unset value) to ensure freshness and setting resourceVersionMatch=NotOlderThan and sendInitialEvents=true. We do that only during the initial ListAndWatch call. Each event (ADD, UPDATE, DELETE) except the BOOKMARK event received from the server is collected. -Passing resourceVersionMatch=MostRecent tells the cacher it has to guarantee that the cache is at least up to date as a LIST executed at the same time. +Passing resourceVersion="" tells the cacher it has to guarantee that the cache is at least up to date as a LIST executed at the same time. Note: This ensures that returned data is consistent, served from etcd via a quorum read and prevents "going back in time". -Note 2: Unfortunately as of today, the watch cache is vulnerable to stale reads, see https://github.com/kubernetes/kubernetes/issues/59848 for more details. +Note 2: Watch cache currently doesn't have the feature of supporting resourceVersion="" and thus is vulnerable to stale reads, see https://github.com/kubernetes/kubernetes/issues/59848 for more details. Step 2: Right after receiving a request from the reflector, the cacher gets the current resourceVersion (aka bookmarkAfterResourceVersion) directly from the etcd. It is used to make sure the cacher is up to date (has seen data stored in etcd) and to let the reflector know it has seen all initial data. @@ -447,19 +452,52 @@ It replaces its internal store with the collected items (syncWith) and reuses th #### API changes -Extend the optional `ResourceVersionMatch` query parameter of `ListOptions` with the following enumeration value: +Extend the `ListOptions` struct with the following field: ``` -const ( - // ResourceVersionMatchMostRecent matches data at the most recent ResourceVersion. - // The returned data is consistent, that is, served from etcd via a quorum read. - // For watch calls, it begins with synthetic "Added" events of all resources up to the most recent ResourceVersion. - // It ends with a synthetic "Bookmark" event containing the most recent ResourceVersion. 
- // For list calls, it has the same semantics as leaving ResourceVersion and ResourceVersionMatch unset. - ResourceVersionMatchMostRecent ResourceVersionMatch = "MostRecent" -) +type ListOptions struct { + ... + + // SendInitialEvents, when set together with Watch option, + // begins the watch stream with synthetic init events to build the + // whole state of all resources followed by a synthetic "Bookmark" + // event containing a ResourceVersion after which the server + // continues streaming events. + // + // When SendInitialEvents option is set, we require ResourceVersionMatch + // option to also be set. The semantics of the watch request are as follows: + // - ResourceVersionMatch = NotOlderThan + // It starts with sending initial events for all objects (at some resource + // version), potentially followed by an event stream until the state + // becomes synced to a resource version as fresh as the one provided by + // the ResourceVersion option. At this point, a synthetic bookmark event + // is sent and the watch stream continues to be served. + // If RV is unset, this is interpreted as "consistent read" and the + // bookmark event is sent when the state is synced at least to the moment + // when the request started being processed. + // - ResourceVersionMatch = Exact + // Unsupported error is returned. + // - ResourceVersionMatch unset (or set to any other value) + // BadRequest error is returned. + // + // Defaults to true if ResourceVersion="" or ResourceVersion="0" (for backward + // compatibility reasons) and to false otherwise. + SendInitialEvents bool +} ``` +The watch bookmark marking the end of the initial events stream will have a dedicated +annotation: +``` +"k8s.io/initial-events-end": "true" +``` +(the exact name is subject to change during API review). It will allow clients to +precisely figure out when the initial stream of events is finished. + +It's worth noting that explicitly setting SendInitialEvents to false with ResourceVersion="0" +will result in not sending initial events, which makes the option work exactly the same +across every potential resource version passed as a parameter. + #### Important optimisations 1. Avoid DeepCopying of initial data
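Returning to the `SendInitialEvents` API sketched above: for orientation only, the wire-level query parameters are assumed here to mirror the Go field names (they remain subject to API review), so a raw watch-list request from a client that does not go through the reflector could look roughly like this:

```sh
# Sketch: stream the current state as synthetic ADDED events, then a bookmark
# carrying the "k8s.io/initial-events-end" annotation, then ordinary watch events.
kubectl get --raw '/api/v1/namespaces/default/pods?watch=true&sendInitialEvents=true&resourceVersionMatch=NotOlderThan&allowWatchBookmarks=true'
```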

@@ -542,19 +580,9 @@ Then on the server side we: 4. otherwise, construct the final list and send back to a client. ### Test Plan -- unit tests for new code added to the watch cache were implemented. -- unit tests for new code added to the reflector were implemented. -- integration tests asserting fallback mechanism for reflector were added. +[X] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes necessary +to implement this enhancement. + +##### Prerequisite testing updates + + + +##### Unit tests + + + +- k8s.io/apiserver/pkg/storage/cacher: 02/02/2023 - 74,7% +- k8s.io/client-go/tools/cache/reflector: 02/02/2023 - 88,6% + +##### Integration tests + +- For alpha, tests asserting fallback mechanism for reflector will be added. + +##### e2e tests + +- For alpha, tests exercising this feature will be added. + ### Graduation Criteria +#### Alpha + +- The Feature is implemented behind `ConsistentWatchList` feature flag +- Initial e2e tests completed and enabled +- Scalability/Performance tests confirm gains of this feature + +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? + +On the contrary. It will decrease the memory usage required for master nodes. + ### Troubleshooting ###### Does enabling the feature change any default behavior? -No +Clients using client-go version 1.26 and up will use the aggregated +discovery endpoint rather than the unaggregated discovery endpoint. +This is handled automatically in client-go and clients should see less +requests to the api server when fetching discovery information. Client +versions older than 1.26 will continue to use the old unaggregated +discovery endpoint without any changes. ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? @@ -690,7 +696,13 @@ and restarting the component. No other changes should be necessary to disable the feature. NOTE: Also set `disable-supported` to `true` or `false` in `kep.yaml`. ---> Yes, the feature may be disabled by reverting the feature flag. +--> + +Yes, the feature may be disabled on the apiserver by reverting the +feature flag. This will disable aggregated discovery for all clients. If there is a golang specific client side bug, the feature may also be +turned off in client-go via the +[WithLegacy()](https://github.com/kubernetes/kubernetes/blob/master/staging/src/k8s.io/client-go/discovery/discovery_client.go#L80) +toggle and this will require a recompile of the application. ###### What happens if we reenable the feature if it was previously rolled back? @@ -731,6 +743,18 @@ feature flags will be enabled on some API servers and not others during the rollout. Similarly, consider large clusters and how enablement/disablement will rollout across nodes. --> +During a rollout, some apiservers may support aggregated discovery and +some may not. It is recommended that clients request for both the +aggregated discovery document with a fallback to the unaggregated +discovery format. This can be achieved by setting the Accept header to +have a fallback to the default GVK of the `/apis` and `/api` endpoint. 
+For example, to request the aggregated discovery type and fall back to +the unaggregated discovery, the following header can be sent: `Accept: +application/json;as=APIGroupDiscoveryList;v=v2beta1;g=apidiscovery.k8s.io,application/json` + +This kind of fallback is already implemented in client-go and this +note is intended for non-golang clients. + ###### What specific metrics should inform a rollback? +n/a. The API introduced does not store data and state is recalculated on the upgrade, downgrade, upgrade cycle. No state is preserved between versions. + ###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? +By enabling aggregated discovery as the default, the new API is +slightly different from the unaggregated version. The +StorageVersionHash field is removed from resources in the aggregated +discovery API. The storage version migrator will have an additional +flag when initializing the discovery client to continue using the +unaggregated API. + ### Monitoring Requirements Kubernetes API (e.g., checking if there are objects with field X set) may be a last resort. Avoid logs or events for this purpose. --> +Operators can check whether an aggregated discovery request can be +made by sending a request to `apis` with +`application/json;as=APIGroupDiscoveryList;v=v2beta1;g=apidiscovery.k8s.io,application/json` +as the Accept header and looking at the `Content-Type` response +header. A Content Type response header of `Content-Type: +application/json;g=apidiscovery.k8s.io;v=v2beta1;as=APIGroupDiscoveryList` +indicates that aggregated discovery is supported and a `Content-Type: +application/json` header indicates that aggregated discovery is not +supported. They can also check for the presence of aggregated +discovery related metrics: `aggregated_discovery_aggregation_count` + ###### How can someone using this feature know that it is working for their instance? - [x] Metrics - Metric name: `aggregator_discovery_aggregation_duration` - Components exposing the metric: `kube-server` - - This is a metric for exposing the time it took to aggregate all the + - This is a metric for exposing the time it took to aggregate all the api resources. + + - Metric name: `aggregator_discovery_aggregation_count` + - Components exposing the metric: `kube-server` + - This is a metric for the number of times that the discovery document has been aggregated. ###### Are there any missing metrics that would be useful to have to improve observability of this feature? -Yes. A metric for the regeneration count of the discovery document. `aggregator_discovery_aggregation_count` +No. ### Dependencies @@ -833,6 +881,12 @@ cluster-level services (e.g. DNS): - Impact of its degraded performance or high-error rates on the feature: --> +No, but if aggregated apiservers are present, the feature will attempt +to contact and aggregate the data published from the aggregated +apiserver on a set interval. If there is a high error rate, stale data +may be returned because the latest data was not able to be fetched +from the aggregated apiserver. + ### Scalability +No. Enabling this feature should reduce the total number of API calls +for client discovery. Instead of clients sending a discovery request +to all group versions (`/apis//`), they will only need +to send a request to the aggregated endpoint to obtain all resources +that the cluster supports. + ###### Will enabling / using this feature result in introducing new API types? 
+Yes, but these API types are not persisted. + ###### Will enabling / using this feature result in any new calls to the cloud provider? +No. + ###### Will enabling / using this feature result in increasing size or count of the existing API objects? - Estimated amount of new objects: (e.g., new Object X for every existing Pod) --> +No. + ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? +No. + ###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? +No. + ### Troubleshooting ###### What steps should be taken if SLOs are not being met to determine the problem? +The feature can be rolled back by setting the AggregatedDiscoveryEndpoint feature flag to false. + ## Implementation History ## Proposal @@ -118,6 +114,10 @@ This problem can be mitigated with a fresh build of kube-controller-manager with updated golang version, but it heavily relies on the go community to keep the time zone database up-to-date. +- Outdated time zone database on the system where kube-controller-manager is running + +Cluster administrator should always ensure their systems are up-to-date. + - Malicious user can create multiple CronJobs with different time zone which can actually trigger Jobs at the exact same time @@ -175,9 +175,9 @@ to implement this enhancement. ##### Unit tests -- `k8s.io/kubernetes/pkg/apis/batch/validation`: `2022-06-09` - `94.4%` -- `k8s.io/kubernetes/pkg/controller/cronjob`: `2022-06-09` - `50.8%` -- `k8s.io/kubernetes/pkg/registry/batch/cronjob`: `2022-06-09` - `61.8%` +- `k8s.io/kubernetes/pkg/apis/batch/validation`: `2023-01-18` - `96.0%` +- `k8s.io/kubernetes/pkg/controller/cronjob`: `2023-01-18` - `51.7%` +- `k8s.io/kubernetes/pkg/registry/batch/cronjob`: `2023-01-18` - `56.3%` ##### Integration tests @@ -185,7 +185,7 @@ None. ##### e2e tests -None. +- [CronJob should support timezone](https://github.com/kubernetes/kubernetes/blob/db4c64b1a987675fff7a234e8633f4fd01c69530/test/e2e/apps/cronjob.go#L301): https://storage.googleapis.com/k8s-triage/index.html?sig=apps&test=should%20support%20timezone ### Graduation Criteria @@ -203,38 +203,7 @@ None. #### GA -TBD - - +- Add a new condition when a cronjob controller encounters an invalid time zone. ### Upgrade / Downgrade Strategy @@ -429,6 +398,7 @@ lies. - *2022-01-14* - Initial KEP draft - *2022-06-09* - Updated KEP for beta promotion. +- *2023-01-18* - Updated KEP for stable promotion. ## Drawbacks diff --git a/keps/sig-apps/3140-TimeZone-support-in-CronJob/kep.yaml b/keps/sig-apps/3140-TimeZone-support-in-CronJob/kep.yaml index 9db242e20fdf..90d29535f9f8 100644 --- a/keps/sig-apps/3140-TimeZone-support-in-CronJob/kep.yaml +++ b/keps/sig-apps/3140-TimeZone-support-in-CronJob/kep.yaml @@ -18,12 +18,12 @@ see-also: replaces: # The target maturity stage in the current dev cycle for this KEP. -stage: beta +stage: stable # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.25" +latest-milestone: "v1.27" # The milestone at which this feature was, or is targeted to be, at each stage. 
milestone: diff --git a/keps/sig-apps/3329-retriable-and-non-retriable-failures/README.md b/keps/sig-apps/3329-retriable-and-non-retriable-failures/README.md index 719d620a2b46..87a40386d145 100644 --- a/keps/sig-apps/3329-retriable-and-non-retriable-failures/README.md +++ b/keps/sig-apps/3329-retriable-and-non-retriable-failures/README.md @@ -230,7 +230,7 @@ thousands of nodes requires usage of pod restart policies in order to account for infrastructure failures. Currently, kubernetes Job API offers a way to account for infrastructure -failures by setting `.backoffLimit > 0`. However, this mechanism intructs the +failures by setting `.backoffLimit > 0`. However, this mechanism instructs the job controller to restart all failed pods - regardless of the root cause of the failures. Thus, in some scenarios this leads to unnecessary restarts of many pods, resulting in a waste of time and computational @@ -354,7 +354,7 @@ As a machine learning researcher, I run jobs comprising thousands of long-running pods on a cluster comprising thousands of nodes. The jobs often run at night or over weekend without any human monitoring. In order to account for random infrastructure failures we define `.backoffLimit: 6` for the job. -However, a signifficant portion of the failures happen due to bugs in code. +However, a significant portion of the failures happen due to bugs in code. Moreover, the failures may happen late during the program execution time. In such case, restarting such a pod results in wasting a lot of computational time. @@ -729,7 +729,7 @@ in different messages for pods. - Reproduction: Run kube-controller-manager with disabled taint-manager (with the flag `--enable-taint-manager=false`). Then, run a job with a long-running pod and disconnect the node -- Comments: handled by node lifcycle controller in: `controller/nodelifecycle/node_lifecycle_controller.go`. +- Comments: handled by node lifecycle controller in: `controller/nodelifecycle/node_lifecycle_controller.go`. However, the pod phase remains `Running`. - Pod status: - status: Unknown @@ -762,7 +762,7 @@ In Alpha, there is no support for Pod conditions for failures or disruptions ini For Beta we introduce handling of Pod failures initiated by Kubelet by adding the pod disruption condition (introduced in Alpha) in case of disruptions -initiated by kubetlet (see [Design details](#design-details)). +initiated by Kubelet (see [Design details](#design-details)). Kubelet can also evict a pod in some scenarios which are not covered with adding a pod failure condition: @@ -863,7 +863,7 @@ dies) between appending a pod condition and deleting the pod. In particular, scheduler can possibly decide to preempt a different pod the next time (or none). This would leave a pod with a condition that it was preempted, when it actually wasn't. This in turn -could lead to inproper handling of the pod by the job controller. +could lead to improper handling of the pod by the job controller. As a solution we implement a worker, part of the disruption controller, which clears the pod condition added if `DeletionTimestamp` is @@ -1218,7 +1218,7 @@ the pod failure does not match any of the specified rules, then default handling of failed pods applies. 
If we limit this feature to use `onExitCodes` only when `restartPolicy=Never` -(see: [limitting this feature](#limitting-this-feature)), then the rules using +(see: [limiting this feature](#limitting-this-feature)), then the rules using `onExitCodes` are evaluated only against the exit codes in the `state` field (under `terminated.exitCode`) of `pod.status.containerStatuses` and `pod.status.initContainerStatuses`. We may also need to check for the exit codes @@ -1279,9 +1279,9 @@ the following scenarios will be covered with unit tests: - handling of a pod failure, in accordance with the specified `spec.podFailurePolicy`, when the failure is associated with - a failed container with non-zero exit code, - - a dedicated Pod condition indicating termmination originated by a kubernetes component + - a dedicated Pod condition indicating termination originated by a kubernetes component - adding of the `DisruptionTarget` by Kubelet in case of: - - eviciton due to graceful node shutdown + - eviction due to graceful node shutdown - eviction due to node pressure -The following scenario will be covered with e2e tests: -- early job termination when a container fails with a non-retriable exit code +The following scenario are covered with e2e tests: +- [sig-apps#gce](https://testgrid.k8s.io/sig-apps#gce): + - Job Using a pod failure policy to not count some failures towards the backoffLimit Ignore DisruptionTarget condition + - Job Using a pod failure policy to not count some failures towards the backoffLimit Ignore exit code 137 + - Job should allow to use the pod failure policy on exit code to fail the job early + - Job should allow to use the pod failure policy to not count the failure towards the backoffLimit +- [sig-scheduling#gce-serial](https://testgrid.k8s.io/sig-scheduling#gce-serial): + - SchedulerPreemption [Serial] validates pod disruption condition is added to the preempted pod +- [sig-release-master-informing#gce-cos-master-serial](https://testgrid.k8s.io/sig-release-master-informing#gce-cos-master-serial): + - NoExecuteTaintManager Single Pod [Serial] pods evicted from tainted nodes have pod disruption condition + +The following scenarios are covered with node e2e tests +([sig-node-presubmits#pr-kubelet-gce-e2e-pod-disruption-conditions](https://testgrid.k8s.io/sig-node-presubmits#pr-kubelet-gce-e2e-pod-disruption-conditions) and +[sig-node-presubmits#pr-node-kubelet-serial-containerd](https://testgrid.k8s.io/sig-node-presubmits#pr-node-kubelet-serial-containerd)): + - GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShutdown] [NodeFeature:GracefulNodeShutdownBasedOnPodPriority] graceful node shutdown when PodDisruptionConditions are enabled [NodeFeature:PodDisruptionConditions] should add the DisruptionTarget pod failure condition to the evicted pods + - PriorityPidEvictionOrdering [Slow] [Serial] [Disruptive][NodeFeature:Eviction] when we run containers that should cause PIDPressure; PodDisruptionConditions enabled [NodeFeature:PodDisruptionConditions] should eventually evict all of the correct pods More e2e test scenarios might be considered during implementation if practical. @@ -1439,7 +1453,7 @@ N/A An upgrade to a version which supports this feature should not require any additional configuration changes. In order to use this feature after an upgrade users will need to configure their Jobs by specifying `spec.podFailurePolicy`. 
The -only noticeable difference in behaviour, without specifying `spec.podFailurePolicy`, +only noticeable difference in behavior, without specifying `spec.podFailurePolicy`, is that Pods terminated by kubernetes components will have an additional condition appended to `status.conditions`. @@ -1654,7 +1668,7 @@ Manual test performed to simulate the upgrade->downgrade->upgrade scenario: - Scenario 2: - Create a job with a long running containers and `backoffLimit=0`. - Verify that the job continues after the node in uncordoned -1. Disable the feature gates. Verify that the above scenarios result in default behaviour: +1. Disable the feature gates. Verify that the above scenarios result in default behavior: - In scenario 1: the job restarts pods failed with exit code `42` - In scenario 2: the job is failed due to exceeding the `backoffLimit` as the failed pod failed during the draining 1. Re-enable the feature gates @@ -1953,7 +1967,7 @@ technics apply): is an increase of the Job controller processing time. - Inspect the Job controller's `job_pods_finished_total` metric for the to check if the numbers of pod failures handled by specific actions (counted - by the `failure_policy_action` label) agree with the expetations. + by the `failure_policy_action` label) agree with the expectations. For example, if a user configures job failure policy with `Ignore` action for the `DisruptionTarget` condition, then a node drain is expected to increase the metric for `failure_policy_action=Ignore`. @@ -1963,7 +1977,7 @@ technics apply): - 2022-06-23: Initial KEP merged - 2022-07-12: Preparatory PR "Refactor gc_controller to do not use the deletePod stub" merged -- 2022-07-14: Preparatory PR "efactor taint_manager to do not use getPod and getNode stubs" merged +- 2022-07-14: Preparatory PR "Refactor taint_manager to do not use getPod and getNode stubs" merged - 2022-07-20: Preparatory PR "Add integration test for podgc" merged - 2022-07-28: KEP updates merged - 2022-08-01: Additional KEP updates merged @@ -1972,7 +1986,7 @@ technics apply): - 2022-08-04: PR "Support handling of pod failures with respect to the configured rules" merged - 2022-09-09: Bugfix PR for test "Fix the TestRoundTripTypes by adding default to the fuzzer" merged - 2022-09-26: Prepared PR for KEP Beta update. 
Summary of the changes: - - propsal to extend kubelet to add the following pod conditions when evicting a pod (see [Design details](#design-details)): + - proposal to extend kubelet to add the following pod conditions when evicting a pod (see [Design details](#design-details)): - DisruptionTarget for evictions due graceful node shutdown, admission errors, node pressure or Pod admission errors - ResourceExhausted for evictions due to OOM killer and exceeding Pod's ephemeral-storage limits - extended the review of pod eviction scenarios by kubelet-initiated pod evictions: diff --git a/keps/sig-apps/3715-elastic-indexed-job/README.md b/keps/sig-apps/3715-elastic-indexed-job/README.md new file mode 100644 index 000000000000..cc51802e97c2 --- /dev/null +++ b/keps/sig-apps/3715-elastic-indexed-job/README.md @@ -0,0 +1,954 @@ + +# KEP-3715: Elastic Indexed Job + + + + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Beta](#beta) + - [GA](#ga) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Alternatives](#alternatives) + - [Use StatefulSets](#use-statefulsets) + - [Make spec.completions fully mutable](#make-speccompletions-fully-mutable) + - [Allow spec.completions=nil for Indexed mode](#allow-speccompletionsnil-for-indexed-mode) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+
+- [x] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
+- [ ] (R) KEP approvers have approved the KEP status as `implementable`
+- [x] (R) Design details are appropriately documented
+- [x] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
+  - [ ] e2e Tests for all Beta API Operations (endpoints)
+  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
+- [x] (R) Graduation criteria is in place
+  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+- [ ] (R) Production readiness review completed
+- [ ] (R) Production readiness review approved
+- [ ] "Implementation History" section is up-to-date for milestone
+- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
+- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes
+
+
+
+[kubernetes.io]: https://kubernetes.io/
+[kubernetes/enhancements]: https://git.k8s.io/enhancements
+[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
+[kubernetes/website]: https://git.k8s.io/website
+
+## Summary
+
+Currently `spec.completions` is an immutable field in Job for both `Indexed` and `NonIndexed` modes.
+This KEP proposes to allow mutating `spec.completions` for `Indexed` Jobs iff `spec.completions` equals
+`spec.parallelism` before and after the update, meaning that `spec.completions` is mutable only
+in tandem with `spec.parallelism`.
+
+## Motivation
+
+There are cases where a batch workload requires an autoscaled indexed job with stable
+DNS pod names, for example MPI Horovod, Ray, PyTorch, etc. This is currently not possible
+because `spec.completions`, which controls the range of indexes in an `Indexed` job, is immutable.
+While such workloads could be modeled as a StatefulSet, the Job API is a better fit because it
+offers batch-specific features such as allowing indexes to run to completion, and [pod and container failure handling](https://kubernetes.io/docs/concepts/workloads/controllers/job/#handling-pod-and-container-failures).
+
+In the above-mentioned cases, there is no distinction between the concept of "how many completed
+indexes do I need" (i.e., `spec.completions`) and the concept of "how many pods do I want running
+at a time" (i.e., `spec.parallelism`). All indexes are expected to start at the same time, but they
+can finish at different times.
+
+The above behavior can be modeled using an Indexed job with
+- `spec.completions` equal to `spec.parallelism`
+- mutable `spec.completions` and `spec.parallelism` as long as they are mutated together
+  (see the sketch after the Non-Goals below).
+
+
+
+### Goals
+
+- Allow mutating `spec.completions` for `Indexed` jobs iff the updated value is equal to `spec.parallelism`.
+
+
+
+### Non-Goals
+
+- Change the existing behavior of `NonIndexed` mode.
+- Change the existing behavior of `Indexed` mode for jobs that never mutate
+  `.spec.completions` and `.spec.parallelism` together.
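To make the motivation concrete, here is a minimal sketch (all names and sizes are placeholders) of a worker Job of the shape described above: an `Indexed` Job whose `spec.completions` always equals `spec.parallelism`, paired with a headless Service that gives each index a stable hostname:

```yaml
apiVersion: v1
kind: Service
metadata:
  name: workers-svc
spec:
  clusterIP: None            # headless; only used for per-pod DNS records
  selector:
    job-name: example-workers
---
apiVersion: batch/v1
kind: Job
metadata:
  name: example-workers
spec:
  completionMode: Indexed
  completions: 3             # always kept equal to parallelism
  parallelism: 3
  template:
    spec:
      subdomain: workers-svc # pods resolve as example-workers-0.workers-svc, ...
      restartPolicy: Never
      containers:
      - name: worker
        image: example.com/worker:latest   # placeholder image
```

Scaling this workload up or down means changing `spec.completions` and `spec.parallelism` together, which is exactly the mutation this KEP proposes to allow.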
+
+
+
+## Proposal
+
+The proposal is to allow mutating `spec.completions` for an `Indexed` Job when it is equal to, and updated
+in tandem with, `spec.parallelism`.
+
+Success and failure semantics are not changed for jobs that never mutate `spec.completions`;
+however, for ones that do, the semantics are changed in the following manner:
+
+Failures will always count. If `spec.completions` was scaled down, then previous pod failures
+outside the new range will still count against the job's backoffLimit. Moreover, the field `status.Failed` will not be
+decremented if previously failed pods are now outside the range. However, `status.Succeeded` will be updated to reflect
+the number of successful indexes within the range as defined below.
+
+If an index successfully completes, the job is then scaled down so that the index falls
+out of range, and the job is later scaled up again so that the index is back in range, then that index
+will be restarted.
+
+
+
+
+### User Stories (Optional)
+
+
+
+#### Story 1
+
+I have a batch workload that requires two sets of pods: a manager and a group of workers that can scale up or down.
+The manager communicates with the workers to manage their load; examples of such workloads include distributed HPC
+and distributed ML training. Examples of frameworks include Horovod with MPI, Spark and Ray.
+
+This workload can be modeled as two `Indexed` Jobs, one for the manager and one for the workers. A
+headless service is created to set up stable hostnames for the worker pods. The workers Job is
+scaled up/down by updating `spec.completions` and `spec.parallelism` in tandem.
+
+The success semantics of these workloads are tied to the manager, not the workers. However, in
+such workloads, when the job is finished the workers are either deleted directly (effectively scaled down to 0)
+or they potentially exit on their own after communicating their final status to the manager, who decides the final status
+of the whole workload.
+
+
+
+
+### Risks and Mitigations
+
+There is a concern that changing completion semantics on the fly can be subject to
+race conditions with declaring the job finished. However, after a closer look, this
+does not appear to be an issue because the job controller uses Update when setting the
+finished status on the job object, which is subject to object-modified errors if the spec
+changes at the same time. Moreover, `spec.completions` will be immutable once a job has a
+finished condition.
+
+
+
+## Design Details
+
+The controller already counts the number of completed indexes within the range defined by `spec.completions` and
+uses that to evaluate if the job completed successfully and to set the `status.Succeeded` count. This is not impacted
+if `spec.completions` changes over time. Moreover, on scale down, the controller also automatically removes any indexes
+outside the range.
+
+As mentioned above, pod failures may have happened to indexes that are outside the range after `spec.completions`
+is updated. Those failures will still count against the backoffLimit of the job and the field `status.Failed` will
+not be decremented.
+
+The only needed change is to modify Job update validation in the kube-apiserver to allow updating an `Indexed` Job's
+`spec.completions` if `spec.completions=spec.parallelism` in both the old and new instance of the Job object.
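As a sketch of what the relaxed validation allows (continuing the placeholder worker Job above, with the feature gate enabled), an update that changes `spec.completions` is accepted only if the field stays equal to `spec.parallelism` both before and after the update:

```yaml
# Accepted: completions and parallelism move together (3 -> 5).
spec:
  completions: 5
  parallelism: 5

# Rejected by kube-apiserver validation: completions changes but would no longer
# equal parallelism, e.g.
#   completions: 5
#   parallelism: 3   # left unchanged
```

Such an update can be applied with, for example, `kubectl patch job example-workers --type=merge -p '{"spec":{"completions":5,"parallelism":5}}'` (the job name is a placeholder).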
+ + + +### Test Plan + + + +[x] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes necessary +to implement this enhancement. + +##### Prerequisite testing updates + + + +##### Unit tests + + + + + +- `k8s.io/kubernetes/pkg/controller/job`: `1/10/2023` - `90%` +- `k8s.io/kubernetes/pkg/apis/batch/validation`: `1/10/2023` - `96%` + +##### Integration tests + +For an `Indexed` Job test that: +- `spec.completions` is only mutable if equal to `spec.parallelism` +- When scaled down, higher indexes are removed first. +- When scaled down to zero, the job is marked as finished successfully. +- When scaled up, the range is extended and new higher indexes are created. +- Success, failure and completion semantics are as discussed above. + + + + + +##### e2e tests + +Coverage is achieved via integration tests. But we will add one e2e test: +- Job is allowed to scale down, then up, then successfully finish when all pods in the new range exit with success + + + + + +### Graduation Criteria + +Since the change is only relaxing validation that doesn't require multiple releases and +requires no changes to the job controller, we will start the feature in beta and enabled +by default since we will not get a lot of value from an alpha release. Just like we +successfully did for [mutable scheduling directives](https://github.com/kubernetes/enhancements/issues/2926). + +Note that if tests reveals a required change that invalidates the above understanding, then we will +revert and start in Alpha. + +#### Beta +- Validation logic in place to allow mutating `spec.completions` in tandem with `.spec.parallelism`. + +#### GA +- Fix any potentially reported bugs. + + + + +### Upgrade / Downgrade Strategy + +N/A, when the feature is enabled, mutating completions will be allowed, when disabled (via +a downgrade or the feature flag) mutating completions will be rejected. + + + +### Version Skew Strategy + +N/A. This feature doesn't impact nodes. + + + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + +The feature can be safely rolled back. + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [x] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: ElasticIndexedJob + - Components depending on the feature gate: kube-apiserver +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? (Do not assume `Dynamic Kubelet Config` feature is enabled). + +###### Does enabling the feature change any default behavior? + +No. + + + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? +Yes. If disabled, kube-apiserver will reject mutating requests to `spec.completions` for `Indexed` jobs. +For Jobs that were previously mutated, then the only implication is that future mutating requests will be +rejected. + + + +###### What happens if we reenable the feature if it was previously rolled back? + +The api-server will accept mutation requests to `spec.completions` for `Indexed` job. + +###### Are there any tests for feature enablement/disablement? + +N/A + + + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? 
+ +The change is opt-in, it doesn't impact already running workloads. + + + +###### What specific metrics should inform a rollback? + +N/A + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + +It will be tested manually prior to beta launch. + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + +No. + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + +Attempt to mutate `spec.completions` for `Indexed` job. + + + +###### How can someone using this feature know that it is working for their instance? + + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [x] Other (treat as last resort) + - Details: `spec.completions` for `Indexed` jobs are successfully mutated according to the semantics described above. + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + +N/A + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [X] Metrics + - Metric name: apiserver_request_total[resource=job, group=batch, verb=UPDATE, code=400] + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + +No. + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + +Not on of itself. + + + +###### Will enabling / using this feature result in introducing new API types? + +No. + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + +No. + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + +No. + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + +No. + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + +No. + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +Create requests will be rejected. + +###### What are other known failure modes? + +In a multi-master setup, when the cluster has skewed apiservers, some create requests +may get accepted and some may get rejected. + +Detection: failed update requests; metric that an operator can monitor is apiserver_request_total[resource=job, group=batch, verb=UPDATE, code=403] +Diagnostics: apiserver logs indicating rejected/accepted job update requests +Testing: no testing, this is a transient state until all instances are on the same k8s version. + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +N/A. + +## Implementation History + +- 2023-01-10: Proposed KEP. + + + + + +## Alternatives + +### Use StatefulSets +Jobs have different success and retry semantics compared to StatefulSet that makes it better suited +for batch workloads. + + +### Make spec.completions fully mutable +Making `spec.completions` fully mutable has no clear use case, and it is in theory a larger scope change. 
+Scaling a job is not related to how many pods I want completed, it is more about how many pods I want active, +and so tying it to parallelism makes sense. + +### Allow spec.completions=nil for Indexed mode +`spec.completions=nil` is a special case that currently allowed only for `NonIndexed` mode. +The success and completions semantics where designed to address the [work queue pattern](https://kubernetes.io/docs/tasks/job/fine-parallel-processing-work-queue/); +specifically, a job is marked as successful if at least one pod exits successfully, and all +remaining pods complete and the number of failures do not exceed `spec.backoffLimit`. +Currently `spec.completions=nil` is not allowed for `Indexed` mode, and so an alternative is to allow it and use `spec.parallelism` to +define the range of indexes. However, this has the following drawbacks: +- the success semantics are not compatible with the use cases we are targeting +- falling back to `spec.parallelism` to define the range of indexes changes how job completion criteria works for `Indexed` job. +- Relaxing validation to allow for a nil value may not be a safe change to API clients that expected completions to not be nil. + +While we could change the success criteria to work better for our use case, and we could do the change under a new `spec.completionMode` +to ensure that the change is safe to clients, making `spec.completions` mutable seems like the clearer API. + + + + + diff --git a/keps/sig-apps/3715-elastic-indexed-job/kep.yaml b/keps/sig-apps/3715-elastic-indexed-job/kep.yaml new file mode 100644 index 000000000000..b7fe39bcb058 --- /dev/null +++ b/keps/sig-apps/3715-elastic-indexed-job/kep.yaml @@ -0,0 +1,38 @@ +title: Elastic Indexed Job +kep-number: 3715 +authors: + - "@ahg-g" +owning-sig: sig-apps +status: implementable +creation-date: 2023-01-10 +reviewers: + - "@alculquicondor" + - "@liggitt" +approvers: + - "@soltysh" + +see-also: + - "/keps/sig-apps/2214-indexed-job" + +# The target maturity stage in the current dev cycle for this KEP. +stage: beta + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.27" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + beta: "1.27" + stable: "v1.28" + +feature-gates: + - name: ElasticIndexedJob + components: + - kube-apiserver +disable-supported: true + +# The following PRR answers are required at beta release +metrics: + diff --git a/keps/sig-auth/3257-trust-anchor-sets/README.md b/keps/sig-auth/3257-trust-anchor-sets/README.md index 0913484d6273..0b8611daebb0 100644 --- a/keps/sig-auth/3257-trust-anchor-sets/README.md +++ b/keps/sig-auth/3257-trust-anchor-sets/README.md @@ -319,11 +319,11 @@ that happens. ### Risks and Mitigations -Scalability: In the limit, ClusterTrustBundle objects will be used by -every pod in the cluster, which will require watches from all Kubelets. When -they are updated, workloads will need to receive the updates fairly quickly -(within 5 minutes across the whole cluster), to accommodate emergency rotation -of trust anchors for a private CA. +Scalability: In the limit, ClusterTrustBundle objects will be used by every pod +in the cluster, which will require one ClusterTrustBundle watch from each +Kubelet in the cluster. 
When they are updated, workloads will need to receive +the updates fairly quickly (within 5 minutes across the whole cluster), to +accommodate emergency rotation of trust anchors for a private CA. Security: Should individual trust anchor set entries designate an OSCP endpoint to check for certificate revociation? Or should we require the URL to be @@ -853,13 +853,8 @@ and creating new ones, as well as about cluster-level services (e.g. DNS): Yes. -A pod that uses a pemTrustAnchors projected volume will result in an additional -watch on the named ClusterTrustBundle object orginating from Kubelet. This -watch will be low-throughput. - -Similar to the existing kubelet watches on secrets and configmaps, special care -will need to be taken to ensure that kube-apiserver can efficiently choose which -single-ClusterTrustBundle watches to update for a given etcd update. +Kubelet will open a watch on ClusterTrustBundle objects. This watch will be +low-throughput. ###### Will enabling / using this feature result in introducing new API types? @@ -894,12 +889,11 @@ arbitrary startup latency for my pod and cause an SLO breach.) Use of the ClusterTrustBundle objects by themselves should have negligible resource impact. -Use of the pemTrustAnchors projected volume type will result in an additional -watch on kube-apiserver for each unique (Node, ClusterTrustBundle) tuple in the -cluster. This is similar to existing kubelet support for projecting configmaps -and secrets into a pod. Just like for secrets and configmaps, we will need to -make sure that kube-apiserver has the indexes it needs to efficiently map etcd -watch events to Kubernetes watch channels. +When the feature gate is enabled, Kubelet will open a watch on all +ClusterTrustBundle objects in the cluster. We expect there to be a low number +of ClusterTrustBundle objects that does not scale with the number of nodes or +workloads in the cluster, although individual ClusterTrustBundle objects could +be large. ### Troubleshooting diff --git a/keps/sig-auth/3257-trust-anchor-sets/kep.yaml b/keps/sig-auth/3257-trust-anchor-sets/kep.yaml index 9d24d2261bfb..e4fb03991332 100644 --- a/keps/sig-auth/3257-trust-anchor-sets/kep.yaml +++ b/keps/sig-auth/3257-trust-anchor-sets/kep.yaml @@ -8,9 +8,11 @@ participating-sigs: status: implementable creation-date: 2022-02-16 reviewers: - - TBD + - liggitt + - enj approvers: - - TBD + - liggitt + - enj ##### WARNING !!! ###### # prr-approvers has been moved to its own location @@ -25,11 +27,11 @@ stage: alpha # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.26" +latest-milestone: "v1.27" # The milestone at which this feature was, or is targeted to be, at each stage. 
milestone: - alpha: "v1.26" + alpha: "v1.27" beta: "" stable: "" diff --git a/keps/sig-auth/3325-self-subject-attributes-review-api/README.md b/keps/sig-auth/3325-self-subject-attributes-review-api/README.md index da707967e3c4..3e3892e12a8c 100644 --- a/keps/sig-auth/3325-self-subject-attributes-review-api/README.md +++ b/keps/sig-auth/3325-self-subject-attributes-review-api/README.md @@ -243,16 +243,33 @@ We expect no non-infra related flakes in the last month as a GA graduation crite #### Alpha +- `SelfSubjectReview` endpoint is introduced in `authentication.k8s.io/v1alpha1` API - Feature implemented behind a feature flag - Initial unit and integration tests completed and enabled +- Corresponding kubectl command implemented: `kubectl alpha auth whoami` #### Beta - Gather feedback from users +- `SelfSubjectReview` is promoted to `authentication.k8s.io/v1beta1` API (Beta APIs are not enabled by default, [see](https://github.com/kubernetes/enhancements/blob/master/keps/sig-architecture/3136-beta-apis-off-by-default/README.md)). +- Promote feature gate to Beta and make it enabled by default +- Unit tests coverage improved +- `kubectl alpha auth whoami` command uses `authentication.k8s.io/v1beta1` API, falls back to `authentication.k8s.io/v1alpha1` API +- Fix [documentation](https://kubernetes.io/docs/reference/access-authn-authz/authentication/#self-subject-review): + - Change API version + - Rewrite conditions to enable the feature #### GA -- Corresponding kubectl command implemented +- `SelfSubjectReview` is promoted to `authentication.k8s.io/v1` API and enable by default +- Promote feature gate to Stable +- `kubectl alpha auth whoami` replaced with `kubectl auth whoami` +- `kubectl auth whoami` command prefers `authentication.k8s.io/v1` API over `authentication.k8s.io/v1beta1` and `authentication.k8s.io/v1alpha1` +- More integration and e2e tests cases +- Fix [documentation](https://kubernetes.io/docs/reference/access-authn-authz/authentication/#self-subject-review): + - Change API version + - Rewrite conditions to enable the feature + - Change kubectl command NOTE: Should not be a part of [conformance tests](https://git.k8s.io/community/contributors/devel/sig-architecture/conformance-tests.md). The fact that a user possesses a token does not necessarily imply the power to know to whom that token belongs. @@ -263,22 +280,9 @@ The fact that a user possesses a token does not necessarily imply the power to k ###### How can this feature be enabled / disabled in a live cluster? - - -- Feature gate +- [X] Feature gate (also fill in values in `kep.yaml`) - Feature gate name: `APISelfSubjectReview` - - Components depending on the feature gate: - - kube-apiserver - -```go -FeatureSpec{ - Default: false, - LockToDefault: false, - PreRelease: featuregate.Alpha, -} -``` + - Components depending on the feature gate: `kube-apiserver` ###### Does enabling the feature change any default behavior? 
diff --git a/keps/sig-auth/3325-self-subject-attributes-review-api/kep.yaml b/keps/sig-auth/3325-self-subject-attributes-review-api/kep.yaml index 5a40f31de61b..303aff5f4e54 100755 --- a/keps/sig-auth/3325-self-subject-attributes-review-api/kep.yaml +++ b/keps/sig-auth/3325-self-subject-attributes-review-api/kep.yaml @@ -10,12 +10,14 @@ reviewers: - "@enj" - "@deads2k" - "@mikedanese" +- "@liggitt" approvers: -- TBD +- "@deads2k" +- "@liggitt" creation-date: "2022-05-30" status: implementable -stage: alpha -latest-milestone: "v1.26" +stage: beta +latest-milestone: "v1.27" milestone: alpha: "v1.26" beta: "v1.27" diff --git a/keps/sig-auth/3766-referencegrant/README.md b/keps/sig-auth/3766-referencegrant/README.md new file mode 100644 index 000000000000..587751c04060 --- /dev/null +++ b/keps/sig-auth/3766-referencegrant/README.md @@ -0,0 +1,941 @@ +# KEP-3766: Move ReferenceGrant to SIG Auth API Group + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Risks and Mitigations](#risks-and-mitigations) + - [No Default Implementation](#no-default-implementation) + - [Potential for Variations Among Implementations](#potential-for-variations-among-implementations) + - [Cross-Namespace References may Weaken Namespace Boundaries](#cross-namespace-references-may-weaken-namespace-boundaries) +- [Design Details](#design-details) + - [General Notes](#general-notes) + - [ReferenceGrant is half of a handshake](#-is-half-of-a-handshake) + - [ReferenceGrant authors must have sufficient access](#referencegrant-authors-must-have-sufficient-access) + - [Resource vs Kind](#-vs-) + - [Revocation behavior](#revocation-behavior) + - [Example Usage](#example-usage) + - [Gateway API Gateway Referencing Secret](#gateway-api-gateway-referencing-secret) + - [Gateway API HTTPRoute Referencing Service](#gateway-api-httproute-referencing-service) + - [PersistentVolumeClaim using cross namespace data source](#persistentvolumeclaim-using-cross-namespace-data-source) + - [API Spec](#api-spec) + - [Outstanding questions and clarifications](#outstanding-questions-and-clarifications) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Beta](#beta) + - [GA](#ga) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Open Questions](#open-questions) + - [Do We Need Verbs?](#do-we-need-verbs) + - [Do We Need Any or Wildcard Selectors?](#do-we-need-any-or-wildcard-selectors) + - [Do We Need Label Selectors?](#do-we-need-label-selectors) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + + +## Release Signoff Checklist + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +[ReferenceGrant](https://gateway-api.sigs.k8s.io/api-types/referencegrant/) was +developed by the [Gateway API subproject](https://gateway-api.sigs.k8s.io/) to +enable certain object references to cross namespaces. More recently, it has also +been [used by +sig-storage](https://kubernetes.io/blog/2023/01/02/cross-namespace-data-sources-alpha/) +to enable cross-namespace data sources. + +This KEP proposes moving ReferenceGrant from its current +`gateway.networking.k8s.io` API group into the `authorization.k8s.io` API +group. + +## Motivation + +Any project that wants to enable cross-namespace references currently has to choose +between introducing a dependency on Gateway API's ReferenceGrant or creating a +new API that would be partially redundant (leading to confusion for users). + +Recent interest between SIGs has made it clear that ReferenceGrant is wanted for use +cases other than Gateway API. We would like to move ReferenceGrant to a neutral home +(ideally, under SIG Auth) in order to make it the canonical API for managing references +across namespaces. + +### Goals + +* Move ReferenceGrant to an API Group that SIG Auth manages +* Clearly define how ReferenceGrant should be used, including both current use + cases and guidance for future use cases +* Implement a library to ensure that ReferenceGrant is implemented consistently + by all controllers + +### Non-Goals + +* Develop an authorizer that will automatically implement ReferenceGrant for all + use cases. (It would be impossible to represent concepts like "all namespaces" + or label selectors that have become important for this KEP). + +## Proposal + +Move the existing ReferenceGrant resource into a new +`authorization.k8s.io` API group, defined within the Kubernetes code base +as part of the 1.27 release. + +We will take this opportunity to clarify and update the API after SIG Auth +feedback. 
This resource will start with v1alpha1 as the API version. + + +### Risks and Mitigations + +#### No Default Implementation +Similar to the Ingress and Gateway APIs, this API will be dependent on +implementations by controllers that are not included by default in Kubernetes. +This could lead to confusion for users. We'll need to rely heavily on +documentation for this feature, tracking all uses of official Kubernetes APIs +that support ReferenceGrant in a central place. + +#### Potential for Variations Among Implementations +Because this relies on each individual controller to implement the logic, +it is possible that implementations may become inconsistent. To avoid that, +we'll provide a standard library for implementing ReferenceGrant. We'll +also strongly recommend that every API that relies on ReferenceGrant +includes robust conformance tests covering this functionality. Existing +Gateway API conformance tests can serve as a model for this. + +#### Cross-Namespace References may Weaken Namespace Boundaries +Although we believe that the handshake required for cross-namespace references +with ReferenceGrant ensures these references will be safe, it does potentially +weaken existing namespace boundaries. We believe ReferenceGrant will have a net +benefit on the ecosystem as it will allow workloads, secrets, and configuration +to be deployed in separate namespaces that more clearly match up with desired +authorization. + +## Design Details + +### General Notes + +#### `ReferenceGrant` is half of a handshake + +When thinking about ReferenceGrant, it is important to remember that it does not +do anything by itself. It *Grants* the *possibility* of making a *Reference* +across namespaces. It's intended that _another object_ (that is, the From object) +complete the handshake by creating a reference to the referent object (the To +object). + +#### ReferenceGrant authors must have sufficient access + +Anyone creating or updating a ReferenceGrant MUST have read access to the resources +they are providing access to. If that authorization check fails, the update or +create action will also fail. + +<<[UNRESOLVED Do we need checks beyond read access? ]>> +Previous Discussion: https://github.com/kubernetes/enhancements/pull/3767#discussion_r1084528657 + +Comment that started that thread: + +does anything ensure the user creating the ReferenceGrant has permissions (read? +write?) on the object they are granting access to? Translating the existing +ReferenceGrant into an authz check means translating from Kind to Resource + +since this is extending "half of a handshake", it seems important to ensure the +actor extending the handshake actually has permission to do so + +<<[/UNRESOLVED]>> + +#### `Resource` vs `Kind` + +When creating a metaresource (that is, a resource that targets other resources) +like ReferenceGrant, it's important to consider if the metaresource uses the more +common `Kind` or the more correct `Resource`. + +In the original Gateway API implementation, we chose to use `Kind` rather than +`Resource`, mainly to improve the user experience. That is, it's easier users +to take the value from the `kind` field at the top of the YAML they are already +using, and put it straight into these fields, rather than needing to do a +kind-resource lookup for every user's interaction with the API. @robscott even +ended up making https://github.com/kubernetes/community/pull/5973 to clarify +the API conventions. 
+ +However, in discussion on this KEP, it's clear that the more generic nature of +_this_ API requires the additional specificity that `Resource` provides. + +The Gateway API ReferenceGrant looked like this: +```yaml +apiVersion: gateway.networking.k8s.io/v1beta1 +kind: ReferenceGrant +metadata: + name: allow-gateways + namespace: bar +spec: + from: + # Note that in Gateway API, Group is currently defaulted + # to this, which means you to explicitly set the group to + # the empty string for Core resources. We should definitely + # change this. + - group: "gateway.networking.kubernetes.io" + kind: Gateway + namespace: foo + to: + - group: "" + kind: Secret +``` + +The new version will look like this instead: +```yaml +apiVersion: authorization.k8s.io/v1alpha1 +kind: ReferenceGrant +metadata: + name: allow-gateways + namespace: bar +spec: + from: + # Assuming that we leave the default for Group to the empty + # string, so that Core objects don't need additional config. + - group: "gateway.networking.kubernetes.io" + resource: gateways + namespace: foo + to: + - group: "" + resource: secrets + +``` + +The new version communicates the scope more clearly because `group`+`resource` +is unambiguous and corresponds to exactly one set of objects on the API Server. + +This change also leaves room for an enhancement. Whether we have an in-tree or +CRD implementation, we can rely on the exact matching that the plural resource +name gives us, and [warn](https://kubernetes.io/blog/2020/09/03/warnings/) if +either side of the grant is for an API that's not served by this cluster. + +#### Revocation behavior + +Unfortunately, there's no way to be specific about what happens when a +ReferenceGrant is deleted in every possible case - the revocation behavior is +dependent on what access is being granted (and revoked). With that said, we +expect the following guidelines to be rules to apply to ALL implementations of +the API: + +* Deletion of a ReferenceGrant means the granted access is revoked. +* ReferenceGrant controllers must remove any configuration generated by the + granted access as soon as possible (eventual consistence permitting). +* Some actions that have already been enabled by the ReferenceGrant (such as + forwarding requests or persisting data) cannot be undone, but no future + actions should be allowed. + +The examples below include information about what happens when the ReferenceGrant +is removed as data points. + +### Example Usage + +#### Gateway API Gateway Referencing Secret + +In this example (from the Gateway API docs), we have a Gateway in the +`gateway-api-example-ns1` namespace, referencing a Secret in the +`gateway-api-example-ns2` namespace. The following ReferenceGrant allows this: + +```yaml +apiVersion: gateway.networking.k8s.io/v1beta1 +kind: Gateway +metadata: + name: cross-namespace-tls-gateway + namespace: gateway-api-example-ns1 +spec: + gatewayClassName: acme-lb + listeners: + - name: https + protocol: HTTPS + port: 443 + hostname: "*.example.com" + tls: + # There's a Kind/Resource mismatch here, which sucks, but it is not + # easily fixable, since Gateway is already a beta, close to GA + # object. 
+ certificateRefs: + - kind: Secret + group: "" + name: wildcard-example-com-cert + namespace: gateway-api-example-ns2 +--- +apiVersion: authorization.k8s.io/v1alpha1 +kind: ReferenceGrant +metadata: + name: allow-ns1-gateways-to-ref-secrets + namespace: gateway-api-example-ns2 +spec: + from: + - group: gateway.networking.k8s.io + resource: gateways + namespace: gateway-api-example-ns1 + to: + - group: "" + resource: secrets +``` + +For Gateway TLS references, if this ReferenceGrant is deleted (revoking, +the grant), then the Listener will become invalid, and the configuration +will be removed as soon as possible (eventual consistency permitting). + +#### Gateway API HTTPRoute Referencing Service + +In this example, a HTTPRoute in the `baz` namespace is directing traffic +to a Service backend in the `quux` namespace. + +```yaml +apiVersion: gateway.networking.k8s.io/v1beta1 +kind: HTTPRoute +metadata: + name: quuxapp + namespace: baz +spec: + parentRefs: + - name: example-gateway + sectionName: https + hostnames: + - quux.example.com + rules: + - matches: + - path: + type: PathPrefix + value: / + # BackendRefs are Services by default. + backendRefs: + - name: quuxapp + namespace: quux + port: 80 +--- +apiVersion: authorization.k8s.io/v1alpha1 +kind: ReferenceGrant +metadata: + name: allow-baz-httproutes + namespace: quux +spec: + from: + - group: gateway.networking.k8s.io + resource: httproutes + namespace: baz + to: + - group: "" + resource: services +``` + +For HTTPRoute objects referencing a backend in another namespace, if the +ReferenceGrant is deleted, the backend will become invalid (since the target +can't be found). If there was more than one backend, then the valid parts of the +HTTPRoute's config would persist in the data plane. + +But in this case, the cross-namespace reference is the _only_ backend, so the +removal of the ReferenceGrant will also result in the removal of the HTTPRoute's +config from the data plane. + +#### PersistentVolumeClaim using cross namespace data source + +This example is taken from https://kubernetes.io/blog/2023/01/02/cross-namespace-data-sources-alpha/ +and updated to use the proposed new spec. + +It allows the PersistentVolumeClaim in the `dev` namespace to use a volume +snapshot from the `prod` namespace as its data source. + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: example-pvc + namespace: dev +spec: + storageClassName: example + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + dataSourceRef: + apiGroup: snapshot.storage.k8s.io + kind: VolumeSnapshot + name: new-snapshot-demo + namespace: prod + volumeMode: Filesystem +--- +apiVersion: authorization.k8s.io/v1alpha1 +kind: ReferenceGrant +metadata: + name: allow-prod-pvc + namespace: prod +spec: + from: + - resource: persistentvolumeclaims + namespace: dev + to: + - group: snapshot.storage.k8s.io + resource: volumesnapshots + name: new-snapshot-demo +``` + +When a ReferenceGrant is deleted, any existing volumes created from the +cross-namespace datasource will still persist, but new volumes will be +rejected". + +### API Spec + +```golang +// ReferenceGrant identifies kinds of resources in other namespaces that are +// trusted to reference the specified kinds of resources in the same namespace +// as the policy. +// +// Each ReferenceGrant can be used to represent a unique trust relationship. +// Additional ReferenceGrants can be used to add to the set of trusted +// sources of inbound references for the namespace they are defined within. 
+// +// ReferenceGrant is a form of runtime verification allowing users to assert +// which cross-namespace object references are permitted. Implementations that +// support ReferenceGrant MUST NOT permit cross-namespace references which have +// no grant, and MUST respond to the removal of a grant by revoking the access +// that the grant allowed. +// +// Implementation of ReferenceGrant is eventually consistent, dependent on +// watch events being received from the Kubernetes API. Although some processing +// delay is inevitable, any updates that could result in revocation of access MUST +// be considered high priority and handled as quickly as possible. +// +// Implementations of ReferenceGrant MUST treat all of the following scenarios +// as equivalent: +// +// * A reference to a Namespace that doesn't exist +// * A reference to a Namespace that exists and a Resource that doesn't exist +// * A reference to Namespace and Resource that exists but a ReferenceGrant +// allowing the reference does not exist +// +// If any of the above occur, a generic error message such as "RefNotPermitted" +// should be communicated, likely via status on the referring resource. +// +// Support: Core +type ReferenceGrant struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Spec defines the desired state of ReferenceGrant. + Spec ReferenceGrantSpec `json:"spec,omitempty"` + + // Note that `Status` sub-resource has been excluded at the + // moment as it was difficult to work out the design. + // A `Status` sub-resource may be added in the future. +} + +// +kubebuilder:object:root=true +// ReferenceGrantList contains a list of ReferenceGrant. +type ReferenceGrantList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ReferenceGrant `json:"items"` +} + +// ReferenceGrantSpec identifies a cross namespace relationship that is trusted +// for Gateway API. +type ReferenceGrantSpec struct { + // From describes the trusted namespaces and kinds that can reference the + // resources described in "To". Each entry in this list MUST be considered + // to be an additional place that references can be valid from, or to put + // this another way, entries MUST be combined using OR. + // + // +kubebuilder:validation:MinItems=1 + // +kubebuilder:validation:MaxItems=16 + From []ReferenceGrantFrom `json:"from"` + + // To describes the resources in this namespace that may be referenced by + // the resources described in "From". Each entry in this list MUST be + // considered to be an additional set of objects that references can be + // valid to, or to put this another way, entries MUST be combined using OR. + // + // +kubebuilder:validation:MinItems=1 + // +kubebuilder:validation:MaxItems=16 + To []ReferenceGrantTo `json:"to"` +} + +``` +<<[UNRESOLVED Are "To" and "From" the right field names? ]>> +Previous Discussion: https://github.com/kubernetes/enhancements/pull/3767#discussion_r1084671720 + +Comments from @thockin: + +I would not argue for subject/object - that's confusing, and I love to take +analogies too far. + +NetPol uses To/From but only for actual communications, and that still has been +critiqued as perhaps "too cute". + +Subject/From isn't too bad, but not as symmetric. Subject/Referrer is correct +but decidedly uncute. Subject/Origin? 
+ +I hold opinions from an API review POV, but I'd like sig-auth to own the +decision :) + +For reference, there was an [earlier +discussion](https://groups.google.com/g/kubernetes-api-reviewers/c/ldmrXXQC4G4) +on the kubernetes-api-reviewers mailing list that's also relevant to this. +<<[/UNRESOLVED]>> + +```go + +// ReferenceGrantFrom describes trusted namespaces and kinds. +type ReferenceGrantFrom struct { + // Group is the group of the referent. + // When empty, the Kubernetes core API group is inferred. + Group Group `json:"group"` + + // Resource is the resource of the referent. + Resource string `json:"resource"` + + // Namespace is the namespace of the referent. + Namespace string `json:"namespace"` +} + +// ReferenceGrantTo describes what Kinds are allowed as targets of the +// references. +type ReferenceGrantTo struct { + // Group is the group of the referent. + // When empty, the Kubernetes core API group is inferred. + Group string `json:"group"` + + // Resource is the resource of the referent. + Resource string `json:"resource"` + + // Name is the name of the referent. When unspecified, this policy + // refers to all resources of the specified Group and Kind in the local + // namespace. + // + // +optional + Name *string `json:"name,omitempty"` +} +``` + +### Outstanding questions and clarifications + +This section lists some of the outstanding questions people have had about +ReferenceGrant, and other items that we'll be clarifying in the godoc and other +documentation as part of the transition to the new API group, along with any +other changes we need to make that aren't already reflected in this document. + +Also note that we don't consider any of these blockers for the general _idea_ of +moving ReferenceGrant to the new API group, just notes to save discussion time. + +* Clarify that an implementation is required to reconcile ReferenceGrant for + specific `To` Kinds. +* Corollary for future work, define how controllers interact. Is it a problem if + multiple controllers consume the same ReferenceGrant? +* Status design is still pending, but it's currently expected that controllers + will indicate status on the _referring_ resources, not on ReferenceGrant itself. +<<[UNRESOLVED do we need status? ]>> +Original Thread: https://github.com/kubernetes/enhancements/pull/3767#discussion_r1084670421 + +We want to be able to represent the following, in descending order of +importance: + +1. Communicate that the ReferenceGrant is actively being used +2. Communicate which controllers are using this ReferenceGrant +3. Communicate how many times it's been used with sufficient granularity that I + can see the effects of my changes (if I remove this reference, am I removing + a dependency on this ReferenceGrant?) + +We could introduce a status structure that allowed each implementing controller +to write 1 entry: + +```yaml +status: + referencesAllowed: + - controllerName: gateway.example.com + numReferences: 1 +``` + +@thockin responded with: + +If we think that the cardinality of controllers is low, we can put it in status. +The downsides are: + +1. Could frequently require retries because of optimistic concurrency failures + (I'm trying to increment my count, but so is everyone else) +1. If we're wrong about cardinality, there's not an easy way out +1. Lots of writes to a resource that will be watched fairly often (every + controller which needs refs will watch all refgrants) +1. We need .status. +1. 
If we instead put that into a ReferenceGrantUse resource (just a tuple of + controller-name and count), then we only have optimistic concurrency + problems with ourselves, we have ~infinite cardinality, nobody will be + watching them, and RefGrant doesn't need .status. + +Downsides: + +1. It's another new resource +1. It's a new pattern, untested in other places. + +<<[/UNRESOLVED]>> +* Clarify that the expected operating model for implementations expects them to + have broad, read access to both ReferenceGrant and the specific `To` Kinds they + support, and then self-limit to only _use_ the relevant ones. +* Decide whether to formally add `*` as a special value for Namespace, to mean + "all namespaces". + +### Test Plan + +[x] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes necessary +to implement this enhancement. + +##### Prerequisite testing updates + +This is a net new resource to Kubernetes so it will not require any changes or +additions to existing tests. + +##### Unit tests + +Unit tests will be used to cover: + +1. ReferenceGrant validation +2. ReferenceGrant implementation library + +Test Cases of the ReferenceGrant implementation library will cover the +following: + +* A reference to a Namespace that doesn't exist +* A reference to a Namespace that exists and a Resource that doesn't exist +* A reference to a Namespace and Resource that exists but a ReferenceGrant + allowing the reference does not exist +* Multiple entries in both from and to entries within a ReferenceGrant +* A ReferenceGrant that allows references to kinds of resources that do not + exist +* Multiple ReferenceGrants with partially overlapping grants +* Revocation of a ReferenceGrant with partially overlapping grants +* A ReferenceGrant that does not specify `to.name` +* A ReferenceGrant that includes overlapping grants for the same namespace both + with and without the resource name specified +* A reference that has not been allowed by any ReferenceGrants +* A ReferenceGrant that is ineffective due to the wrong `from.namespace` value +* A ReferenceGrant that is ineffective due to the wrong `from.group` value +* A ReferenceGrant that is ineffective due to the wrong `from.resource` value +* A ReferenceGrant that is ineffective due to the wrong `to.group` value +* A ReferenceGrant that is ineffective due to the wrong `to.resource` value +* A ReferenceGrant that is ineffective due to the wrong `to.name` value +* A ReferenceGrant that is ineffective due to being in the wrong namespace + +More details will be added as the details of the implementation library are +clarified. + +##### Integration tests + +Before this graduates to beta, we will provide a reference implementation with a +sample CRD that will be used to provide integration tests. + +##### e2e tests + +We will strongly encourage every API that uses ReferenceGrant to define +conformance tests for their use of ReferenceGrant. + +### Graduation Criteria + +#### Beta + +[ ] Reference implementation with integration tests. +[ ] Almost all of the fields and behavior have conformance test coverage in at least one project (for example Gateway API). + +#### GA + +[ ] Conformance tests that exercise all ReferenceGrant API calls (not the actual implementation of the API). +[ ] Multiple implementations of this API passing all relevant conformance tests. 
+ +### Upgrade / Downgrade Strategy + +N/A + +### Version Skew Strategy + +Version skew is a bit different here. Although we will provide a shared library +for implementing this API, this will only be used by third-party controllers, +not built-in components. + +There will be some implementations that support both the API defined by Gateway +API and this API. Since these resources are entirely additive and can be +duplicative, we can copy Gateway API resources to the new API group and delete +the old Gateway API resources as part of a seamless migration. We expect that +many implementations will provide this recommendation to users, and we may even +provide tooling to simplify this process. + +<<[UNRESOLVED open questions that don't clearly fit elsewhere ]>> +## Open Questions + +This KEP was merged in a provisional state with a number of open questions +remaining. This section highlights the questions we have not resolved yet. + +### Do We Need Verbs? + +Previous Discussion: https://github.com/kubernetes/enhancements/pull/3767#discussion_r1084509958 + +We could add a `verbs` field to enable users to specify the kind of referential +access they want to grant. For example, we could define "read", "route", and +"backup" as well-known verbs to start. We could also allow implementations to +support additional domain-prefixed verbs. + +This would enable more precise grants, and potentially more open ended fields +elsewhere in the resource, see the next item for more. + +### Do We Need Any or Wildcard Selectors? + +Previous Discussions: +* https://github.com/kubernetes/enhancements/pull/3767#discussion_r1086020464 +* https://github.com/kubernetes/enhancements/pull/3767#discussion_r1086012665 + +We already allow "Name" to be optional in `To`, effectively resulting in +wildcard behavior. Should we expand that to allow any of the following? + +1. References to any group or resource +1. References from any group or resource +1. References from any namespace + +Historically, there has been pushback to allowing any group or resource +because it could enable unknown or unintended kinds of access. If we +introduced the concepts of "verbs" described above, this would become +a moot point. + +### Do We Need Label Selectors? + +Previous Discussions: +* https://github.com/kubernetes/enhancements/pull/3767#discussion_r1084492070 +* https://github.com/kubernetes/enhancements/pull/3767#discussion_r1084674648 + +As a natural next step, instead of simply allowing all, we could use +label selectors to enable: + +1. Access to resources with specific labels +1. Access from namespaces with specific labels + +In both cases, this would significantly increase implementation +complexity. + +As a potential middleground, we could explore a solution that left +room for namespace selectors without actually including them. For example: + +```go +type ReferenceGrantFrom struct { + //... + Peer ReferenceGrantPeer +} + +// Exatcly one field should be set. If none are found, clients must +// assume that an unknown value was specified. +type ReferenceGrantPeer { + Namespace *string + // Future: Namespace selector +} +``` + +This assumes that group+resource supersedes namespaces - is that true? Or do we +really want: + +```go +type ReferenceGrant struct { + // ... + From ReferenceGrantPeer +} + +type ReferenceGrantPeer struct { + // ... 
+ Namespace *string + NamespaceResource *NamespaceResource +} + +type NamespaceResource struct { + Namespace string + Group string + Resource string +} +``` +<<[/UNRESOLVED]>> + + +## Production Readiness Review Questionnaire + +### Feature Enablement and Rollback + +###### How can this feature be enabled / disabled in a live cluster? + +- [x] Other + - Describe the mechanism: Enable alpha ReferenceGrant API + - Will enabling / disabling the feature require downtime of the control + plane? + No + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? (Do not assume `Dynamic Kubelet Config` feature is enabled). + No + +###### Does enabling the feature change any default behavior? + +No + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + +Yes + +###### What happens if we reenable the feature if it was previously rolled back? + +The API would become accessible again, implementing controllers may need to be +restarted to pick up the presence of this API. + +###### Are there any tests for feature enablement/disablement? + +No + +### Rollout, Upgrade and Rollback Planning + +###### How can a rollout or rollback fail? Can it impact already running workloads? + +API enablement may not work, but that would not be unique to this API. + +###### What specific metrics should inform a rollback? + +N/A, this is just an API + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + +N/A, this is just an API + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + +No + +### Monitoring Requirements + +N/A, this is just an API + +###### How can an operator determine if the feature is in use by workloads? +``` +kubectl get referencegrants --all-namespaces +``` + +###### How can someone using this feature know that it is working for their instance? + +This will be dependent on the API that ReferenceGrant is used with. In Gateway API, +each resource has clear status conditions that reflect the validity of a cross-namespace +reference. + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + +Changes to ReferenceGrants are processed by the shared library within 10 seconds 99% over a quarter. + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + +Changes to ReferenceGrants are processed by the shared library within 10 seconds. + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + +No + +### Dependencies + +N/A, this is just an API + +###### Does this feature depend on any specific services running in the cluster? + +- API Server + +### Scalability + +###### Will enabling / using this feature result in any new API calls? + +Yes, users may install controllers that watch for changes to ReferenceGrants. +Users may also create ReferenceGrants to enable cross-namespace references. + +###### Will enabling / using this feature result in introducing new API types? + +API Type: ReferenceGrant +Supported Number of Objects per Cluster: No limit +Supported Number of Objects per Namespace: No limit + +###### Will enabling / using this feature result in any new calls to the cloud provider? + +No + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? 
+ +No + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + +No + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + +No + +### Troubleshooting + +###### How does this feature react if the API server and/or etcd is unavailable? + +The API would not be accessible. We would likely recommend that controllers revoke +cross-namespace references if they could not find ReferenceGrants that allow them +so this could result in a disruption for anything that relied on cross-namespace +references. + +###### What are other known failure modes? + +N/A, this is just an API + +###### What steps should be taken if SLOs are not being met to determine the problem? + +N/A, this is just an API + +## Implementation History + +* July 2021: [ReferencePolicy is proposed in Gateway API](https://github.com/kubernetes-sigs/gateway-api/pull/711) +* August 2021: [ReferencePolicy is added to Gateway API](https://github.com/kubernetes-sigs/gateway-api/pulls?page=2&q=is%3Apr+is%3Aclosed+ReferencePolicy) +* June 2022: [ReferencePolicy is renamed to ReferenceGrant](https://github.com/kubernetes-sigs/gateway-api/pull/1179) +* December 2022: [SIG-Storage uses ReferenceGrant for cross-namespace data storage sources](https://kubernetes.io/blog/2023/01/02/cross-namespace-data-sources-alpha/) +* December 2022: [ReferenceGrant graduates to beta in Gateway API v0.6.0](https://github.com/kubernetes-sigs/gateway-api/releases/tag/v0.6.0) + +## Drawbacks + +N/A + +## Alternatives + +1. ReferenceGrant could remain as a CRD +This would probably be fine, we just don't really have a good place for it to +live. This could also complicate installation of Gateway API and other APIs that +depended on this. + +2. Every API that wanted to support cross-namespace references could maintain their own version of ReferenceGrant +This would be a confusing mess, we should avoid this at all costs. 
+ diff --git a/keps/sig-auth/3766-referencegrant/kep.yaml b/keps/sig-auth/3766-referencegrant/kep.yaml new file mode 100644 index 000000000000..6f8f282e1982 --- /dev/null +++ b/keps/sig-auth/3766-referencegrant/kep.yaml @@ -0,0 +1,29 @@ +title: Move ReferenceGrant to sig-auth API Group +kep-number: 3766 +authors: + - "@robscott" + - "@youngnick" +owning-sig: sig-auth +participating-sigs: + - sig-network + - sig-storage +status: provisional +creation-date: 2023-01-20 +reviewers: + - "@enj" + - "@deads2k" + - "@liggitt" + - "@msau42" + - "@thockin" +approvers: + - "@enj" + - "@liggitt" + +stage: alpha + +latest-milestone: "v1.27" + +milestone: + alpha: "v1.27" + +disable-supported: true diff --git a/keps/sig-cli/2227-kubectl-default-container/README.md b/keps/sig-cli/2227-kubectl-default-container/README.md index 8fd2b7e22652..0055da193fb3 100644 --- a/keps/sig-cli/2227-kubectl-default-container/README.md +++ b/keps/sig-cli/2227-kubectl-default-container/README.md @@ -16,6 +16,11 @@ - [Risks and Mitigations](#risks-and-mitigations) - [Design Details](#design-details) - [Test Plan](#test-plan) + - [Testing Strategy](#testing-strategy) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) - [Graduation Criteria](#graduation-criteria) - [Alpha -> Beta Graduation](#alpha---beta-graduation) - [Beta -> GA Graduation](#beta---ga-graduation) @@ -179,13 +184,37 @@ metadata: ``` ### Test Plan + +#### Testing Strategy + Add a unit test for each command, testing the behavior with the annotation, without the annotation, and with the --container flag +##### Prerequisite testing updates + +##### Unit tests + +The main unit test is in package under `vendor/k8s.io/kubectl/pkg/`. + +- vendor/k8s.io/kubectl/pkg/cmd/exec:2023-01-13 - 68.9% +- vendor/k8s.io/kubectl/pkg/cmd/util/helpers.go:2023-01-13 - 38.1% +- vendor/k8s.io/kubectl/pkg/polymorphichelpers/logsforobject.go:2023-01-13 - 79% + +See details in . + +##### Integration tests + +N/A + +##### e2e tests + +[Kubectl logs default container logs](https://github.com/kubernetes/kubernetes/blob/02f893b6e28549a1008d4fc65c40990b87c07830/test/e2e/kubectl/logs.go#L163): + ### Graduation Criteria #### Alpha -> Beta Graduation As this is an opt-in feature, no gate is expected. + - At least 2 release cycles pass to gather feedback and bug reports during - Documentations, add it to [well-known annotations docs](https://kubernetes.io/docs/reference/kubernetes-api/labels-annotations-taints/) - Add a warning deprecation message when using the annotation `kubectl.kubernetes.io/default-logs-container` @@ -310,11 +339,26 @@ resource usage (CPU, RAM, disk, IO, ...) 
in any components?** ## Implementation History -2021-03-10: Alpha implementation completion in 1.21 -- https://github.com/kubernetes/kubernetes/pull/97099 -- https://github.com/kubernetes/kubernetes/pull/99615 -- https://github.com/kubernetes/kubernetes/pull/99581 -- https://github.com/kubernetes/kubernetes/pull/99833 +2021-03-10(v1.21) Alpha: implementation completion + +- +- +- +- + +2021-10-28(v1.24) Beta: make 'kubectl logs' default to the first container when default container cannot be +determined or found by annotations + +- + +2022-04-05(v1.25): remove deprecated message of `kubectl.kubernetes.io/default-logs-container` + +- + +2023-01-13(v1.27): add an e2e test case + +- + ## Drawbacks diff --git a/keps/sig-cli/2227-kubectl-default-container/kep.yaml b/keps/sig-cli/2227-kubectl-default-container/kep.yaml index 36cc39d41ec6..ec491dfb578d 100644 --- a/keps/sig-cli/2227-kubectl-default-container/kep.yaml +++ b/keps/sig-cli/2227-kubectl-default-container/kep.yaml @@ -6,7 +6,7 @@ owning-sig: sig-cli participating-sigs: status: implementable creation-date: 2020-12-16 -last-updated: 2021-03-10 +last-updated: 2023-01-20 reviewers: - "@dougsland" - "@eddiezane" @@ -19,18 +19,18 @@ approvers: see-also: # The target maturity stage in the current dev cycle for this KEP. -stage: alpha +stage: stable # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.21" +latest-milestone: "v1.27" # The milestone at which this feature was, or is targeted to be, at each stage. milestone: alpha: "v1.21" - beta: "v1.23" - stable: "v1.25" + beta: "v1.24" + stable: "v1.27" # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled diff --git a/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components/kep.yaml b/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components/kep.yaml index 34ba8626c52f..d08c499973d0 100644 --- a/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components/kep.yaml +++ b/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components/kep.yaml @@ -5,7 +5,7 @@ authors: owning-sig: sig-instrumentation participating-sigs: - sig-arch -status: implementable +status: implemented creation-date: 2021-07-30 reviewers: - pohly @@ -16,12 +16,12 @@ approvers: see-also: - "/keps/sig-instrumentation/1602-structured-logging" replaces: [] -stage: beta -latest-milestone: "v1.24" +stage: stable +latest-milestone: "v1.26" milestone: alpha: "v1.23" beta: "v1.24" - stable: "v1.25" + stable: "v1.26" feature-gates: [] disable-supported: true diff --git a/keps/sig-instrumentation/3466-kubernetes-component-health-slis/README.md b/keps/sig-instrumentation/3466-kubernetes-component-health-slis/README.md index ced1109680a6..b7e6db5153f2 100644 --- a/keps/sig-instrumentation/3466-kubernetes-component-health-slis/README.md +++ b/keps/sig-instrumentation/3466-kubernetes-component-health-slis/README.md @@ -128,20 +128,20 @@ checklist items _must_ be updated for the enhancement to be released. Items marked with (R) are required *prior to targeting to a milestone / release*. 
-- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) -- [ ] (R) KEP approvers have approved the KEP status as `implementable` -- [ ] (R) Design details are appropriately documented -- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) - - [ ] e2e Tests for all Beta API Operations (endpoints) - - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) - - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [X] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [X] (R) KEP approvers have approved the KEP status as `implementable` +- [X] (R) Design details are appropriately documented +- [X] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [X] e2e Tests for all Beta API Operations (endpoints) + - [X] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [X] (R) Minimum Two Week Window for GA e2e tests to prove flake free - [ ] (R) Graduation criteria is in place - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) -- [ ] (R) Production readiness review completed +- [X] (R) Production readiness review completed - [ ] (R) Production readiness review approved -- [ ] "Implementation History" section is up-to-date for milestone -- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] -- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes +- [X] "Implementation History" section is up-to-date for milestone +- [X] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [X] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes - + - [x] API .status - Condition name: The Condition name will not be standardized in `alpha` however implementations are given the `status` field to report what they see fit. @@ -1175,8 +1355,8 @@ Describe the metrics themselves and the reasons why they weren't added (e.g., co implementation difficulties, etc.). --> -A metric describing the time it takes for the implementation to program the rules -defined in an AdminNetworkPolicy could be useful. However, some implementations +A metric describing the time it takes for the implementation to program the rules +defined in an AdminNetworkPolicy could be useful. However, some implementations may struggle to report such a metric. ### Dependencies @@ -1242,10 +1422,10 @@ Describe them, providing: - Supported number of objects per namespace (for namespace-scoped objects) --> -- API Type: AdminNetworkPolicy -- Supported number of objects per cluster: The total number of AdminNetworkPolicies -will not be limited. 
However, it is important to remember that the only users -creating ANPs will be Cluster-Admins, of which there should only be a handful. This +- API Type: AdminNetworkPolicy +- Supported number of objects per cluster: The total number of AdminNetworkPolicies +will not be limited. However, it is important to remember that the only users +creating ANPs will be Cluster-Admins, of which there should only be a handful. This will help limit the total number ANPs being deployed at any given time. ###### Will enabling / using this feature result in any new calls to the cloud provider? @@ -1256,7 +1436,7 @@ Describe them, providing: - Estimated increase: --> -This depends on the implementation, specifically based on the API used to program +This depends on the implementation, specifically based on the API used to program the AdminNetworkPolicy rules into the data-plane. ###### Will enabling / using this feature result in increasing size or count of the existing API objects? @@ -1295,7 +1475,7 @@ This through this both in small and large cases, again with respect to the [supported limits]: https://git.k8s.io/community//sig-scalability/configs-and-limits/thresholds.md --> -Not in any in-tree components, resource efficiency will need to be monitored by the +Not in any in-tree components, resource efficiency will need to be monitored by the implementation. ### Troubleshooting diff --git a/keps/sig-network/2681-pod-host-ip/kep.yaml b/keps/sig-network/2681-pod-host-ip/kep.yaml index e08ac84cafc7..9de5d23944fb 100644 --- a/keps/sig-network/2681-pod-host-ip/kep.yaml +++ b/keps/sig-network/2681-pod-host-ip/kep.yaml @@ -25,13 +25,13 @@ stage: alpha # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.24" +latest-milestone: "v1.27" # The milestone at which this feature was, or is targeted to be, at each stage. 
milestone: - alpha: "v1.24" - beta: "v1.25" - stable: "v1.27" + alpha: "v1.27" + beta: "v1.28" + stable: "v1.30" # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled diff --git a/keps/sig-network/3458-remove-transient-node-predicates-from-service-controller/README.md b/keps/sig-network/3458-remove-transient-node-predicates-from-service-controller/README.md new file mode 100644 index 000000000000..529cc8c90fbb --- /dev/null +++ b/keps/sig-network/3458-remove-transient-node-predicates-from-service-controller/README.md @@ -0,0 +1,381 @@ +# KEP-3458: Remove transient node predicates from KCCM's service controller + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Risks and Mitigations](#risks-and-mitigations) + - [Risk](#risk) + - [Mitigations](#mitigations) +- [Design Details](#design-details) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Beta](#beta) + - [GA](#ga) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + + +## Release Signoff Checklist + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+
+- [x] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
+- [ ] (R) KEP approvers have approved the KEP status as `implementable`
+- [ ] (R) Design details are appropriately documented
+- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
+  - [ ] e2e Tests for all Beta API Operations (endpoints)
+  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
+- [ ] (R) Graduation criteria is in place
+  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+- [ ] (R) Production readiness review completed
+- [ ] (R) Production readiness review approved
+- [ ] "Implementation History" section is up-to-date for milestone
+- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
+- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes
+
+
+
+[kubernetes.io]: https://kubernetes.io/
+[kubernetes/enhancements]: https://git.k8s.io/enhancements
+[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
+[kubernetes/website]: https://git.k8s.io/website
+
+## Summary
+
+The service controller in the Kubernetes cloud controller manager (KCCM)
+currently adds/removes Nodes from the load balancers' node set in the following
+cases:
+
+a) When a node gets the taint `ToBeDeletedByClusterAutoscaler` added/removed
+b) When a node goes `Ready` / `NotReady`
+
+b), however, only applies to services with `externalTrafficPolicy: Cluster`. In
+both cases, removing the Node in question from the load balancers' node set will
+cause all connections on that node to get terminated instantly. This can be
+considered a bug / sub-optimal behavior for nodes which are experiencing a
+transient readiness state or for terminating nodes, since connections are not
+allowed to drain in those cases, even though the load balancer might support
+that. Moreover, on large clusters with a lot of nodes and entropy, re-syncing
+load balancers like this can lead to rate-limiting by the cloud provider due to
+an excessive number of update calls.
+
+To enable connection draining, reduce cloud provider API calls and simplify the
+KCCM's sync loop, this KEP proposes that the service controller stop
+synchronizing the load balancers' node set in these cases. Since the current
+behavior has been in place for a long time, a new feature gate
+`StableLoadBalancerNodeSet` will be introduced and used to enable the more
+optimal behavior.
+
+## Motivation
+
+Abruptly terminating connections in the cases defined by a) and b) above can be
+seen as buggy behavior and should be improved. By enabling connection draining,
+applications are allowed to benefit from graceful shutdown / termination as far
+as cluster ingress connectivity is concerned. Users of Kubernetes will also see
+a reduction in the number of cloud API calls stemming from syncing load
+balancers with the Kubernetes cluster state.
+
+### Goals
+
+- Stop re-configuring the load balancers' node set for cases a) and b) above
+
+### Non-Goals
+
+- Stop re-configuring the load balancers' node set for fully deleted /
+  newly added cluster nodes, or for nodes which get annotated with
+  `node.kubernetes.io/exclude-from-external-load-balancers`.
+- Enable load balancer connection draining while a Node is draining. This
+  requires health check changes.
+
+## Proposal
+
+### Risks and Mitigations
+
+#### Risk
+
+1. Cloud providers which do not allow VM deletion when the VM is referenced by
+   other constructs, will block the cluster auto-scaler (CA) from deleting the VM
+   upon downscale. This will result in reduced downscale performance by the CA,
+   or completely block VM deletion from happening - this is because the service
+   controller will never proceed to de-reference the VM from the load balancer
+   node set until the Node is fully deleted in the API server, which will never
+   occur until the VM is deleted. The three major cloud providers (GCP/AWS/Azure)
+   do support this, however, and other providers are expected to as well.
+2. Cloud providers which do not configure their load balancer health checks to
+   target the service proxy's healthz (or, alternatively, constructs which
+   validate the endpoint's reachability across the data plane) risk experiencing
+   regressions as a consequence of the removal of b). This would happen if a node
+   is faced with a terminal error which does impact the Node's network
+   connectivity. Configuring health checks this way is considered incorrect, and
+   is therefore not expected to be the case.
+3. By removing b) above we are delaying the removal of the Node from the load
+   balancers' node set until the Node is completely deleted in the API server.
+   This might have an impact on CA downscaling. The reason for this is that the
+   CA deletes the VM and expects the node controller in the KCCM to notice this
+   and delete the Node in Kubernetes, as a consequence. If the node controller
+   takes a while to sync that and other Node related events trigger load balancer
+   reconciliation while this is happening, then the service controller will error
+   until the cluster reaches steady-state (because it's trying to sync Nodes for
+   which the VM is non-existent). A mitigation to this is presented in
+   [Mitigations](#mitigations).
+
+#### Mitigations
+
+- Cloud providers/workloads which do not support the behavior mentioned in
+  [Risk](#risk) can set the feature flag to false, thus defaulting back to the
+  current mechanism.
+- For point 3, we could implement the following change in the service
+  controller: ensure it only enqueues Node UPDATEs on changes to
+  `.metadata.labels["node.kubernetes.io/exclude-from-external-load-balancers"]`.
+  When processing the sync, list only Nodes following the existing predicates
+  defined for `externalTrafficPolicy: Cluster/Local` services (both currently
+  exclude Nodes with the taint `ToBeDeletedByClusterAutoscaler`). This will
+  ensure that whatever Nodes are included in the load balancer set always have
+  a corresponding VM. Doing this would, however, revert on the goal of the KEP.
+
+## Design Details
+
+- Implement the change in the service controller and ensure it does not add /
+  remove nodes from the load balancers' node set for cases a) and b) mentioned
+  in [Summary](#summary) (a rough sketch of the intended filtering is shown
+  below).
+- Add the feature gate: `StableLoadBalancerNodeSet`, set it to "on" by
+  default and promote it directly to Beta.
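+
+The sketch below illustrates the intended filtering. It is a simplified,
+hypothetical example (the helper names and the way the gate value is passed in
+are assumptions for illustration, not the actual KCCM code): with
+`StableLoadBalancerNodeSet` enabled, only changes to the exclude label trigger a
+load balancer node set re-sync, while readiness flips and the CA deletion taint
+are ignored.
+
+```go
+package servicecontrollersketch
+
+import corev1 "k8s.io/api/core/v1"
+
+const (
+	excludeLBLabel   = "node.kubernetes.io/exclude-from-external-load-balancers"
+	toBeDeletedTaint = "ToBeDeletedByClusterAutoscaler"
+)
+
+// nodeUpdateTriggersSync is a hypothetical predicate: it decides whether an
+// old -> new Node transition should cause the service controller to re-sync
+// the load balancers' node set.
+func nodeUpdateTriggersSync(oldNode, newNode *corev1.Node, stableNodeSet bool) bool {
+	// Changes to the exclude label always matter (this remains a goal of the KEP).
+	if oldNode.Labels[excludeLBLabel] != newNode.Labels[excludeLBLabel] {
+		return true
+	}
+	if stableNodeSet {
+		// Cases a) and b) from the Summary are ignored: transient readiness
+		// changes and the CA deletion taint no longer trigger re-syncs.
+		return false
+	}
+	// Legacy behavior: taint and readiness transitions trigger a re-sync.
+	return hasTaint(oldNode, toBeDeletedTaint) != hasTaint(newNode, toBeDeletedTaint) ||
+		isReady(oldNode) != isReady(newNode)
+}
+
+func hasTaint(node *corev1.Node, key string) bool {
+	for _, t := range node.Spec.Taints {
+		if t.Key == key {
+			return true
+		}
+	}
+	return false
+}
+
+func isReady(node *corev1.Node) bool {
+	for _, c := range node.Status.Conditions {
+		if c.Type == corev1.NodeReady {
+			return c.Status == corev1.ConditionTrue
+		}
+	}
+	return false
+}
+```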
+
+### Test Plan
+
+#### Prerequisite testing updates
+
+#### Unit tests
+
+The service controller in the KCCM currently has a set of tests validating
+expected syncs caused by Node predicates; these will need to be updated.
+
+#### Integration tests
+
+Kubernetes is mostly tested via unit tests and e2e, not integration, and this is
+not expected to change.
+
+#### e2e tests
+
+Kubernetes in general needs to extend its load balancing test suite with
+disruption tests; this might be the right effort to get that ball rolling.
+Testing would include:
+
+- validation that an application running on a deleting VM benefits from graceful
+  termination of its TCP connection.
+- validation that Node readiness state changes do not result in load balancer
+  re-syncs.
+
+### Graduation Criteria
+
+#### Beta
+
+This is addressing a sub-optimal behavior currently existing in Kubernetes, so
+the feature gate will be moved to Beta and "on" by default from the start.
+
+The feature flag should be kept available until we get sufficient evidence of
+people not being affected by anything mentioned in [Risk](#risk) or otherwise.
+
+#### GA
+
+Given a lack of reported issues during Beta, the feature gate will be locked in
+at GA.
+
+The tentative timeline for this is v1.29. Services of `type: LoadBalancer` are
+sufficiently common on any given Kubernetes cluster that any cloud provider
+susceptible to the [Risk](#risk) will very likely report issues during Beta.
+
+### Upgrade / Downgrade Strategy
+
+Any upgrade to a version enabling the feature, followed by a downgrade to a
+version disabling it, is not expected to cause any issues. On upgrade, the
+service controller will add all existing cluster nodes (bar excluded ones) to
+the load balancer set. On downgrade, any `NotReady` / tainted nodes will get
+reconciled by the service controller corresponding to the downgraded control
+plane version and be removed from the load balancer set - as they should be.
+
+### Version Skew Strategy
+
+This change is contained to the control plane only and is therefore not
+impacted by any version skew.
+
+## Production Readiness Review Questionnaire
+
+### Feature Enablement and Rollback
+
+###### How can this feature be enabled / disabled in a live cluster?
+
+- [X] Feature gate (also fill in values in `kep.yaml`)
+  - Feature gate name: `StableLoadBalancerNodeSet`
+  - Components depending on the feature gate: Kubernetes cloud controller manager
+
+###### Does enabling the feature change any default behavior?
+
+Yes, Kubernetes Nodes will remain in the load balancers' node set until fully
+deleted in the API server, as opposed to the current behavior, which adds /
+removes nodes from the set when the Node experiences transient state changes.
+Cloud providers which do not support deleting VMs that are still referenced by
+load balancers will be unable to delete such VMs when the cluster auto-scaler
+downscales and attempts to delete the VM.
+
+###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?
+
+Yes, by switching the feature gate back off.
+
+###### What happens if we reenable the feature if it was previously rolled back?
+
+The behavior will be restored immediately.
+
+###### Are there any tests for feature enablement/disablement?
+
+N/A
+
+### Rollout, Upgrade and Rollback Planning
+
+###### How can a rollout or rollback fail? Can it impact already running workloads?
+
+Rollout and rollback are not expected to fail.
+
+###### What specific metrics should inform a rollback?
+
+Performance degradation of CA downscaling, or a flat-out inability to delete VMs.
+
+###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
+
+N/A
+
+###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?
+
+No.
+
+### Monitoring Requirements
+
+The only mechanism currently implemented is events for syncing load balancers
+in the KCCM. The events are triggered any time a Service is synced or a Node
+change triggers a re-sync of all services. This will not change and can be used
+to monitor the implemented change. The implementation will result in fewer load
+balancer re-syncs.
+
+A new metric `load_balancer_sync_count` will be added for explicitly monitoring
+the number of load balancer related syncs performed by the service controller.
+This will include load balancer syncs caused by Service and Node changes.
+
+A new metric `nodesync_error_rate` will be added for explicitly monitoring the
+number of errors produced by syncing Node related events for load balancers. The
+goal is to have an indicator of whether the service controller is impacted by
+point 3 mentioned in [Risk](#risk), and at which frequency.
+
+###### How can an operator determine if the feature is in use by workloads?
+
+Analyze events stemming from the API server, correlating node state changes
+(readiness or addition / removal of the taint `ToBeDeletedByClusterAutoscaler`)
+to load balancer re-syncs. The events should show a clear reduction in re-syncs
+after the implementation and rollout of the change.
+
+###### How can someone using this feature know that it is working for their instance?
+
+When a node transitions between `Ready` and `NotReady`, the load balancers are
+not re-synced and the load balancers' node set remains unchanged for
+`externalTrafficPolicy: Cluster` services. On downscaling by the CA, the node
+will remain in the load balancers' set for a longer period of time, until
+the Node object in Kubernetes is fully deleted.
+
+###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
+
+The total number of load balancer re-syncs should be reduced, leading to fewer
+cloud provider API calls. Also, more subtly: connections will get a chance to
+gracefully terminate when the CA downscales cluster nodes. For
+`externalTrafficPolicy: Cluster` services, connections "traversing" a "nexthop"
+node might not be impacted by that Node's readiness state anymore.
+
+###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?
+
+
+
+- [X] Metrics
+  - Events: The KCCM triggers events when syncing load balancers. The number of
+    these events should be reduced.
+
+###### Are there any missing metrics that would be useful to have to improve observability of this feature?
+
+N/A
+
+### Dependencies
+
+###### Does this feature depend on any specific services running in the cluster?
+
+It depends on:
+
+- having a `type: LoadBalancer` service with `externalTrafficPolicy: Cluster`
+
+### Scalability
+
+###### Will enabling / using this feature result in any new API calls?
+
+No
+
+###### Will enabling / using this feature result in introducing new API types?
+
+No
+
+###### Will enabling / using this feature result in any new calls to the cloud provider?
+
+No
+
+###### Will enabling / using this feature result in increasing size or count of the existing API objects?
+ +No + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + +No + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + +No + +### Troubleshooting + +###### How does this feature react if the API server and/or etcd is unavailable? + +Not any different than today. + +###### What are other known failure modes? + +None + +###### What steps should be taken if SLOs are not being met to determine the problem? + +Validate that services of `type: LoadBalancer` exists on the cluster and that +Nodes are experiencing a transitioning readiness state, alternatively that the +CA downscales and deletes VMs. + +## Implementation History + +- 2023-02-01: Initial proposal + +## Drawbacks + +## Alternatives diff --git a/keps/sig-network/3458-remove-transient-node-predicates-from-service-controller/kep.yaml b/keps/sig-network/3458-remove-transient-node-predicates-from-service-controller/kep.yaml new file mode 100755 index 000000000000..eea8ff138019 --- /dev/null +++ b/keps/sig-network/3458-remove-transient-node-predicates-from-service-controller/kep.yaml @@ -0,0 +1,24 @@ +prnumber: "3460" +name: 3458-remove-transient-node-predicates-from-service-controller +title: Remove transient node predicates from KCCM's service controller +kep-number: "3458" +authors: ['@alexanderConstantinescu'] +owning-sig: sig-network +participating-sigs: [sig-network] +reviewers: + - "@thockin" +approvers: + - "@thockin" +creation-date: "2022-08-07" +last-updated: v1.27 +status: implementeable +stage: stable +latest-milestone: "v1.27" +milestone: + beta: "v1.27" + stable: "v1.29" +feature-gates: + - name: StableLoadBalancerNodeSet + components: + - kube-controller-manager +disable-supported: true diff --git a/keps/sig-network/3668-reserved-service-nodeport-range/README.md b/keps/sig-network/3668-reserved-service-nodeport-range/README.md new file mode 100644 index 000000000000..4b0a626b5ade --- /dev/null +++ b/keps/sig-network/3668-reserved-service-nodeport-range/README.md @@ -0,0 +1,468 @@ +# KEP-3668: Reserve Nodeport Ranges For Dynamic And Static Port Allocation + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Current Services NodePort allocation model](#current-services-nodeport-allocation-model) + - [Proposed Services NodePorts allocation model](#proposed-services-nodeports-allocation-model) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - 
[Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. + +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +Nodeport Service can expose a Service outside the cluster, and allow external applications access to an set of Pods. NodePort has several ports that are widespread in the cluster and allow to load-balance traffic from the external. And the port number can be assigned: + +- dynamically, the cluster will pick one within the configured service node port range. +- statically, the user will set one port within the configured service node port range. + +Currently, there is no possibility, before creating a NodePort Service with a static port, to know if the port has been chosen already by any other NodePort using dynamic allocation. + +The NodePort Service port range can be logically subdivided to avoid the risk of conflict between NodePort that use static and dynamic port allocation. The idea is mostly the same with [Reserve Service IP Ranges (KEP-3070)](https://github.com/kubernetes/enhancements/tree/master/keps/sig-network/3070-reserved-service-ip-range#release-signoff-checklist) did. + +## Motivation + +There are many situations that users need to rely on well-known predefined NodePort ports. + +The usage scenario is similar in nature to the pre-assigned Cluster IP addresses described in KEP 3070: in some deployments, IPs or ports need to be hard-coded in some specific components of the cluster. Hard-coding of NodePort typically occurs in private cloud deployments with complex hyper-converged architectures, where we cannot rely on some of the underlying infrastructure, or even need to use the services running on Kubernetes as a base service. 
As a concrete example, imagine we need to deploy a set of applications in an environment without DNS servers and external LBs; this set of applications includes several K8S-based deployments, but others are non-K8S-based and are deployed on bare metal. At this point, CoreDNS is expected to be the only DNS server in the environment, and it needs to provide resolution services not only within K8S but also outside the K8S cluster, where a fixed IP and NodePort are necessary.
+
+During cluster initialization, some services created earlier may take up the ClusterIP and NodePort values that we expect to assign later; the ClusterIP problem is solved by KEP 3070, while the NodePort problem may still occur.
+
+
+### Goals
+
+- Allow creating a Service with a static NodePort with less risk of port conflict.
+
+
+### Non-Goals
+
+- Changing the allocators' implementation (bitmaps, ...).
+- Adding new configuration options or flags.
+
+## Proposal
+
+### Story 1
+
+"As a Kubernetes administrator, I would like to be able to assign a fixed port to my DNS Service to provide access to services outside the cluster and reduce the possibility of conflicts with existing NodePort Services within the cluster."
+
+### Story 2
+
+"As a Kubernetes developer I want to safely allocate fixed ports to some NodePort Services so I can automate some configuration parameters of my cluster."
+
+### Risks and Mitigations
+
+There is no risk. Every NodePort Service should be able to get a port from the NodePort range as long as there are free ports.
+
+This KEP only subdivides the NodePort range and prioritizes one band over the other for dynamic port allocation, without adding limitations to the number of ports assigned.
+
+## Design Details
+
+### Current Services NodePort allocation model
+
+The nodePort is an additional port that needs to be bound on all nodes for the service. If a port is specified manually, is in range, and is not in use,
+it will be allocated as the nodePort; otherwise creation of the service will fail.
+The Service NodePort range is defined in the apiserver with the following flag:
+
+```
+--service-node-port-range Default: 30000-32767
+A port range to reserve for services with NodePort visibility. This must not overlap with the ephemeral port range on nodes. Example: '30000-32767'. Inclusive at both ends of the range.
+```
+
+In a multi-master environment, multiple requests can arrive at the same time at different apiservers, so the allocation logic must
+guarantee that no duplicate ports are assigned. To solve this consensus problem in a distributed system, allocators are
+implemented using bitmaps that are serialized and stored in an ["opaque" API object](https://github.com/kubernetes/kubernetes/blob/6a16d7d31aeb0f95c4ae513311bfecef9492f30e/pkg/apis/core/types.go#L5714).
+
+```go
+// RangeAllocation is an opaque API object (not exposed to end users) that can be persisted to record
+// the global allocation state of the cluster. The schema of Range and Data generic, in that Range
+// should be a string representation of the inputs to a range (for instance, for IP allocation it
+// might be a CIDR) and Data is an opaque blob understood by an allocator which is typically a
+// binary range. Consumers should use annotations to record additional information (schema version,
+// data encoding hints). A range allocation should *ALWAYS* be recreatable at any time by observation
+// of the cluster, thus the object is less strongly typed than most.
+type RangeAllocation struct {
+	metav1.TypeMeta
+	// +optional
+	metav1.ObjectMeta
+	// A string representing a unique label for a range of resources, such as a CIDR "10.0.0.0/8" or
+	// port range "10000-30000". Range is not strongly schema'd here. The Range is expected to define
+	// a start and end unless there is an implicit end.
+	Range string
+	// A byte array representing the serialized state of a range allocation. Additional clarifiers on
+	// the type or format of data should be represented with annotations. For IP allocations, this is
+	// represented as a bit array starting at the base IP of the CIDR in Range, with each bit representing
+	// a single allocated address (the fifth bit on CIDR 10.0.0.0/8 is 10.0.0.4).
+	Data []byte
+}
+```
+
+The apiserver Service registry contains allocators backed by those bitmaps ([pkg/registry/core/rest/storage_core.go](https://github.com/kubernetes/kubernetes/blob/2ac6a4121f5b2a94acc88d62c07d8ed1cd34ed63/pkg/registry/core/rest/storage_core.go#L96-L103)) in order to assign these ports.
+
+```go
+type LegacyRESTStorage struct {
+	ServiceClusterIPAllocator          rangeallocation.RangeRegistry
+	SecondaryServiceClusterIPAllocator rangeallocation.RangeRegistry
+	ServiceNodePortAllocator           rangeallocation.RangeRegistry
+}
+
+...
+func (c LegacyRESTStorageProvider) NewLegacyRESTStorage(apiResourceConfigSource serverstorage.APIResourceConfigSource, restOptionsGetter generic.RESTOptionsGetter) (LegacyRESTStorage, genericapiserver.APIGroupInfo, error) {
+	...
+	var serviceNodePortRegistry rangeallocation.RangeRegistry
+	serviceNodePortAllocator, err := portallocator.New(c.ServiceNodePortRange, func(max int, rangeSpec string) (allocator.Interface, error) {
+		mem := allocator.NewAllocationMap(max, rangeSpec)
+		// TODO etcdallocator package to return a storage interface via the storageFactory
+		etcd, err := serviceallocator.NewEtcd(mem, "/ranges/servicenodeports", serviceStorageConfig.ForResource(api.Resource("servicenodeportallocations")))
+		if err != nil {
+			return nil, err
+		}
+		serviceNodePortRegistry = etcd
+		return etcd, nil
+	})
+```
+
+The bitmaps implement a [strategy for allocating a random free value if none is specified](https://github.com/kubernetes/kubernetes/blob/2ac6a4121f5b2a94acc88d62c07d8ed1cd34ed63/pkg/registry/core/service/allocator/bitmap.go#L186-L205):
+
+```go
+// randomScanStrategy chooses a random address from the provided big.Int, and then
+// scans forward looking for the next available address (it will wrap the range if
+// necessary).
+type randomScanStrategy struct {
+	rand *rand.Rand
+}
+
+func (rss randomScanStrategy) AllocateBit(allocated *big.Int, max, count int) (int, bool) {
+	if count >= max {
+		return 0, false
+	}
+	offset := rss.rand.Intn(max)
+	for i := 0; i < max; i++ {
+		at := (offset + i) % max
+		if allocated.Bit(at) == 0 {
+			return at, true
+		}
+	}
+	return 0, false
+}
+```
+
+### Proposed Services NodePorts allocation model
+
+The strategy proposed here was first described and implemented in [KEP 3070](https://github.com/kubernetes/enhancements/blob/master/keps/sig-network/3070-reserved-service-ip-range/README.md#proposed-services-clusterips-allocation-model), with some fine-tuning. The following is a specific explanation of the strategy.
+
+The strategy follows the formula `min(max($min, node-range-size/$step), $max)`: the reserved offset is never less than $min or more than $max, with a graduated step function between them, where $min = 16, $max = 128 and $step = 32. If node-range-size < $min, the formula doesn't apply and the strategy will not change; both dynamic and static ports will be allocated from the whole range with equal probability.
+
+This subdivides the range into two bands:
+
+- lower band, used preferably for static port assignment.
+- upper band, used preferably for dynamic port assignment.
+
+Dynamic port assignment will use the upper band by default; once this has been exhausted it will use the lower band. This will allow users to define static allocations on the lower band of the predefined range with a low risk of collision, keeping the implementation backwards compatible. A small sketch of this computation is shown after the examples below.
+
+
+Example 1 Default service nodeport range:
+
+- Service Node Port Range: 30000-32767
+- Range Size: 32767 - 30000 = 2767
+- Band Offset: min(max(16,2767/32),128) = min(86,128) = 86
+- Static band start: 30000
+- Static band ends: 30086
+
+    ┌─────────────┬─────────────────────────────────────────────┐
+    │   static    │                   dynamic                   │
+    └─────────────┴─────────────────────────────────────────────┘
+
+    ◄────────────► ◄────────────────────────────────────────────►
+  30000         30086                                        32767
+
+Example 2 Large service nodeport range:
+
+- Service Node Port Range: 20000-32767
+- Range Size: 32767 - 20000 = 12767
+- Band Offset: min(max(16,12767/32),128) = min(398,128) = 128
+- Static band start: 20000
+- Static band ends: 20128
+
+    ┌─────────────┬─────────────────────────────────────────────┐
+    │   static    │                   dynamic                   │
+    └─────────────┴─────────────────────────────────────────────┘
+
+    ◄────────────► ◄────────────────────────────────────────────►
+  20000         20128                                        32767
+
+Example 3 Small service nodeport range:
+
+- Service Node Port Range: 32567-32767
+- Range Size: 32767 - 32567 = 200
+- Band Offset: min(max(16,200/32),128) = min(max(16, 6),128) = 16
+- Static band start: 32567
+- Static band ends: 32583
+
+    ┌─────────────┬─────────────────────────────────────────────┐
+    │   static    │                   dynamic                   │
+    └─────────────┴─────────────────────────────────────────────┘
+
+    ◄────────────► ◄────────────────────────────────────────────►
+  32567         32583                                        32767
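+
+The following is a small, self-contained sketch (not the actual allocator code;
+the function names are illustrative only) of the band computation above and of
+how a dynamic allocation can prefer the upper band before falling back to the
+lower one. The real allocator picks a random offset within the band rather than
+scanning sequentially; the scan here is a simplification.
+
+```go
+package nodeportbands
+
+// staticBandSize implements min(max($min, rangeSize/$step), $max) with
+// $min = 16, $max = 128 and $step = 32, as described above.
+func staticBandSize(rangeSize int) int {
+	const bandMin, bandMax, step = 16, 128, 32
+	if rangeSize < bandMin {
+		// The formula doesn't apply; no static band is reserved.
+		return 0
+	}
+	offset := rangeSize / step
+	if offset < bandMin {
+		offset = bandMin
+	}
+	if offset > bandMax {
+		offset = bandMax
+	}
+	return offset
+}
+
+// pickDynamicPort prefers a free port in the upper (dynamic) band and only
+// falls back to the lower (static) band once the upper band is exhausted.
+// isFree stands in for the bitmap lookup used by the real allocator.
+func pickDynamicPort(base, rangeSize int, isFree func(port int) bool) (int, bool) {
+	offset := staticBandSize(rangeSize)
+	for port := base + offset; port < base+rangeSize; port++ { // upper band first
+		if isFree(port) {
+			return port, true
+		}
+	}
+	for port := base; port < base+offset; port++ { // then the lower band
+		if isFree(port) {
+			return port, true
+		}
+	}
+	return 0, false
+}
+```
+
+For the default range (`base = 30000`, `rangeSize = 32767 - 30000 = 2767`),
+`staticBandSize` returns 86, matching Example 1 above.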
+
+
+### Test Plan
+
+[x] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+
+##### Prerequisite testing updates
+
+None
+
+##### Unit tests
+
+- `kubernetes/pkg/registry/core/service/portallocator`: `12/10/2022` - `83.1`
+
+##### Integration tests
+
+This feature doesn't modify cluster behavior, only the order in which dynamic ports are assigned to NodePort Services; there is no need for e2e or integration tests, unit tests are enough.
+
+##### e2e tests
+
+### Graduation Criteria
+
+#### Alpha
+
+- Feature implemented behind a feature flag
+
+#### Beta
+
+- Gather feedback from developers and community.
+- No issues reported.
+
+#### GA
+
+- No issues reported during two releases.
+
+
+### Upgrade / Downgrade Strategy
+
+The feature only changes the allocation strategy; the bitmaps and the underlying logic remain the same,
+guaranteeing full compatibility and no risk when rolling back the feature.
+
+### Version Skew Strategy
+
+Same as with upgrade, there is no change in the bitmaps, keeping the feature 100% compatible.
+
+
+## Production Readiness Review Questionnaire
+
+### Feature Enablement and Rollback
+
+###### How can this feature be enabled / disabled in a live cluster?
+
+- [x] Feature gate (also fill in values in `kep.yaml`)
+  - Feature gate name: ServiceNodePortStaticSubrange
+  - Components depending on the feature gate: kube-apiserver
+- [ ] Other
+  - Describe the mechanism:
+  - Will enabling / disabling the feature require downtime of the control
+    plane?
+  - Will enabling / disabling the feature require downtime or reprovisioning
+    of a node? (Do not assume `Dynamic Kubelet Config` feature is enabled).
+
+###### Does enabling the feature change any default behavior?
+
+Dynamically allocated NodePorts will not initially be chosen from the first X values of the Service Node Port range, where X is obtained from the formula
+described in [Proposed Services NodePorts allocation model](#proposed-services-nodeports-allocation-model).
+This change is transparent to the user: since the port is and was already random, it doesn't matter from which band the port is obtained.
+
+###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?
+
+Yes.
+
+###### What happens if we reenable the feature if it was previously rolled back?
+
+Nothing, this only affects new requests and how these new requests obtain dynamically assigned ports.
+
+
+###### Are there any tests for feature enablement/disablement?
+
+No need to add new tests; current tests should keep working as they do today.
+
+### Rollout, Upgrade and Rollback Planning
+
+###### How can a rollout or rollback fail? Can it impact already running workloads?
+
+Current Services will not be affected; it doesn't matter in which band their ports were allocated.
+This only affects the creation of new NodePort Services without a port specified, so there is no risk of impact on running workloads.
+
+###### What specific metrics should inform a rollback?
+
+There is currently no metric available for NodePort allocation, so 4 metrics will be added:
+
+ - kube_apiserver_nodeport_allocator_allocated_ports
+ - kube_apiserver_nodeport_allocator_available_ports
+ - kube_apiserver_nodeport_allocator_allocation_total
+ - kube_apiserver_nodeport_allocator_allocation_errors_total
+
+###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
+
+Since it is a purely in-memory feature, an upgrade or downgrade doesn't have any impact.
+
+###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?
+
+No
+
+### Monitoring Requirements
+
+The following allocator metrics gain a new label containing information about the type of allocation requested (dynamic or static):
+
+ - kube_apiserver_nodeport_allocator_allocation_total
+ - kube_apiserver_nodeport_allocator_allocation_errors_total
+
+The other allocator metrics cannot use the additional label because, when a port is released, there is no information on how the port was allocated:
+
+ - kube_apiserver_nodeport_allocator_allocated_ports
+ - kube_apiserver_nodeport_allocator_available_ports
+
+###### How can an operator determine if the feature is in use by workloads?
+
+An operator will only observe the change in behavior described above for dynamically assigned NodePorts.
+
+###### How can someone using this feature know that it is working for their instance?
+
+- [ ] Events
+  - Event Reason:
+- [ ] API .status
+  - Condition name:
+  - Other field:
+- [X] Other (treat as last resort)
+  - Details: Create Services without setting the nodePort and observe that all the allocated ports belong to the upper band of the range, and that once the upper band is exhausted the allocator keeps assigning ports from the lower band until the whole range is exhausted.
+
+###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
+
+N/A
+
+###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?
+
+- [ ] Metrics
+  - Metric name:
+  - [Optional] Aggregation method:
+  - Components exposing the metric:
+- [ ] Other (treat as last resort)
+  - Details:
+
+###### Are there any missing metrics that would be useful to have to improve observability of this feature?
+
+This proposal adds 4 more metrics, and two of them are used to observe this feature. No further metrics should be needed in the future.
+
+### Dependencies
+
+###### Does this feature depend on any specific services running in the cluster?
+
+No
+
+### Scalability
+
+###### Will enabling / using this feature result in any new API calls?
+
+No
+
+###### Will enabling / using this feature result in introducing new API types?
+
+No
+
+###### Will enabling / using this feature result in any new calls to the cloud provider?
+
+No
+
+###### Will enabling / using this feature result in increasing size or count of the existing API objects?
+
+No
+
+###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?
+
+No
+
+###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components?
+
+No
+
+###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)?
+
+No
+
+### Troubleshooting
+
+###### How does this feature react if the API server and/or etcd is unavailable?
+
+N/A
+
+###### What are other known failure modes?
+
+N/A
+
+###### What steps should be taken if SLOs are not being met to determine the problem?
+
+## Implementation History
+
+## Drawbacks
+
+## Alternatives
diff --git a/keps/sig-network/3668-reserved-service-nodeport-range/kep.yaml b/keps/sig-network/3668-reserved-service-nodeport-range/kep.yaml
new file mode 100755
index 000000000000..7261316646a1
--- /dev/null
+++ b/keps/sig-network/3668-reserved-service-nodeport-range/kep.yaml
@@ -0,0 +1,40 @@
+title: Reserve Nodeport Ranges For Dynamic And Static Port Allocation
+kep-number: 3668
+authors:
+  - "@xuzhenglun"
+owning-sig: sig-network
+participating-sigs:
+status: implementable
+creation-date: 2022-11-30
+reviewers:
+approvers:
+  - "@thockin"
+  - "@aojea"
+
+see-also:
+  - "/keps/sig-network/3070-reserved-service-ip-range"
+
+# The target maturity stage in the current dev cycle for this KEP.
+stage: alpha
+
+# The most recent milestone for which work toward delivery of this KEP has been
+# done. This can be the current (upcoming) milestone, if it is being actively
+# worked on.
+latest-milestone: "v1.27"
+
+# The milestone at which this feature was, or is targeted to be, at each stage.
+milestone: + alpha: "v1.27" + beta: "v1.28" + stable: "v1.30" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: ServiceNodePortStaticSubrange + components: + - kube-apiserver +disable-supported: true + +# The following PRR answers are required at beta release +metrics: diff --git a/keps/sig-network/3705-cloud-node-ips/README.md b/keps/sig-network/3705-cloud-node-ips/README.md new file mode 100644 index 000000000000..7baba7274423 --- /dev/null +++ b/keps/sig-network/3705-cloud-node-ips/README.md @@ -0,0 +1,766 @@ +# KEP-3705: Cloud Dual-Stack --node-ip Handling + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Current behavior](#current-behavior) + - [Changes to --node-ip](#changes-to-) + - [Changes to the provided-node-ip annotation](#changes-to-the--annotation) + - [Changes to cloud providers](#changes-to-cloud-providers) + - [Example of --node-ip possibilities](#example-of--possibilities) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Deprecation](#deprecation) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + + +## Release Signoff Checklist + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +Kubelet supports dual-stack `--node-ip` values for clusters with +no cloud provider (eg, "bare metal" clusters), but not for clusters +using a cloud provider. This KEP proposes to fix that. + +## Motivation + +### Goals + +- Allow administrators of clusters using external cloud providers to + override the cloud-selected node IPs in a more dual-stack-aware + manner: + + - Allow administrators to override a single node IP on a node in a + dual-stack cluster without causing the node to become + single-stack. + + - Allow administrators to override both node IPs on a node in a + dual-stack cluster. + + - Allow administrators to override the order of node IPs on a node + in a dual-stack cluster (ie, requesting that nodes be + IPv6-primary dual stack rather than IPv4-primary dual stack, or + vice versa). + + - Allow administrators to configure nodes to be single-stack when + the cloud provider suggests dual-stack, without needing to + provide a specific IPv4 or IPv6 node IP. + +- Define how kubelet will communicate these new intents to cloud + providers. + +- Update the code in `k8s.io/cloud-provider/node/helpers` to implement + the needed algorithms for the new behaviors. + +- Rename the `alpha.kubernetes.io/provided-node-ip` annotation, which + has not been alpha for a long time. + +### Non-Goals + +- Changing the behavior of nodes using legacy cloud providers. + +- Changing the node-IP-selection behavior in any currently-existing + Kubernetes cluster. This means that the default behavior when + `--node-ip` is not specified will remain the same, and the behavior + of any currently-allowed `--node-ip` value will remain the same. New + behavior will only be triggered by `--node-ip` values that would + have been rejected by kubelet in older clusters. + +- Adding the ability for nodes in clusters with cloud providers to use + node IPs that are not already known to the cloud provider. 
(In + particular, this implies that we will continue to not support + dual-stack nodes in clouds that do not themselves support + dual-stack.) + +- Improving the behavior of autodetecting the node IP in any other + ways. (Eg, being able to pass a CIDR rather than an IP.) There was + some discussion of ways we might do this in the abandoned [KEP-1664] + (which this KEP is a spiritual successor to), but if we want that, + it can be done later, independently of these changes. (These changes + were mostly desired in the context of non-cloud-provider clusters + anyway.) + +[KEP-1664]: https://github.com/kubernetes/enhancements/issues/1664 + +## Proposal + +### Risks and Mitigations + +As the intention is to not change the user-visible behavior except in +clusters where administrators explicitly make use of the new +functionality, there should be no risk of breaking existing clusters, +nor of surprising administrators by suddenly exposing node services on +unexpected IPs. + +## Design Details + +### Current behavior + +Currently, when `--cloud-provider` is passed to kubelet, kubelet +expects `--node-ip` to be either unset, or a single IP address. (If it +is unset, that is equivalent to passing `--node-ip 0.0.0.0`, which +means "autodetect an IPv4 address, or if there are no usable IPv4 +addresses, autodetect an IPv6 address".) + +If `--cloud-provider` and `--node-ip` are both specified (and +`--node-ip` is not "`0.0.0.0`" or "`::`"), then kubelet will add an +annotation to the node, `alpha.kubernetes.io/provided-node-ip`. Cloud +providers expect this annotation to conform to the current expected +`--node-ip` syntax (ie, a single value); if it does not, then they +will log an error and not remove the +`node.cloudprovider.kubernetes.io/uninitialized` taint from the node, +causing the node to remain unusable until kubelet is restarted with a +valid (or absent) `--node-ip`. + +When `--cloud-provider` is not passed, the `--node-ip` value can also +be a comma-separated pair of dual-stack IP addresses. However, unlike +in the single-stack case, the IPs in the dual-stack case are not +currently allowed to be "unspecified" IPs (ie `0.0.0.0` or `::`); you +can only make a (non-cloud) node be dual-stack if you explicitly +specify both IPs that you want it to use. + +### Changes to `--node-ip` + +The most obvious required change is that we need to allow +comma-separated dual-stack `--node-ip` values in clusters using +external cloud providers (but _not_ in clusters using legacy cloud +providers). + +Additionally, the fact that kubelet does not currently pass +"`0.0.0.0`" and "`::`" to the cloud provider creates a compatibility +problem: we would like for administrators to be able to say "use an +IPv6 node IP but I don't care which one" in cloud-provider clusters +like they can in non-cloud-provider clusters, but for compatibility +reasons, we can't change the existing behavior of "`--cloud-provider +external --node-ip ::`" (which doesn't do what it's "supposed to", but +does have possibly-useful side effects that have led some users to use +it anyway; see [kubernetes #111695].) + +So instead, we will add new syntax, and allow administrators to say +"`--node-ip IPv4`" or "`--node-ip IPv6`" if they want to explicitly +require that the cloud provider pick a node IP of a specific family. 
+
+(This also improves on the behavior of the existing "`0.0.0.0`" and
+"`::`" options, because you almost never actually want the "or fall
+back to the other family if there are no IPs of this family" behavior
+that "`0.0.0.0`" and "`::`" imply.)
+
+Additionally, we will update the code to allow including "`IPv4`" and
+"`IPv6`" in dual-stack `--node-ip` values as well (in both cloud and
+non-cloud clusters). This code will have to check the status of the
+feature gate until the feature is GA.
+
+[kubernetes #111695]: https://github.com/kubernetes/kubernetes/issues/111695
+
+### Changes to the `provided-node-ip` annotation
+
+Currently, if the user passes an IP address to `--node-ip` which is
+not recognized by the cloud provider as being a valid IP for that
+node, kubelet will set that value in the `provided-node-ip`
+annotation, and the cloud provider will see it, realize that the node
+IP request can't be fulfilled, log an error, and leave the node in the
+tainted state.
+
+It makes sense to have the same behavior if the user passes a "new"
+(eg, dual-stack) `--node-ip` value to kubelet, but the cloud provider
+does not recognize the new syntax and thus can't fulfil the request.
+Conveniently, we can do this just by passing the dual-stack
+`--node-ip` value in the existing annotation; the old cloud provider
+will try to parse it as a single IP address, fail, log an error, and
+leave the node in the tainted state, which is exactly what we wanted
+it to do if it can't interpret the `--node-ip` value correctly.
+
+Therefore, we do not _need_ a new annotation for the new `--node-ip`
+values; we can continue to use the existing annotation, assuming
+existing cloud providers will treat unrecognized values as errors.
+
+That said, the existing annotation name is
+`alpha.kubernetes.io/provided-node-ip` but it hasn't been "alpha" for
+a long time. We should fix this. So:
+
+  1. When `--node-ip` is unset, kubelet should delete both the legacy
+     `alpha.kubernetes.io/provided-node-ip` annotation and the new
+     `kubernetes.io/provided-node-ip` annotation (regardless of
+     whether the feature gate is enabled or not, to avoid problems
+     with rollback and skew).
+
+  2. When the `CloudNodeIPs` feature is enabled and `--node-ip` is
+     set, kubelet will set both the legacy annotation and the new
+     annotation. (It always sets them both to the same value, even if
+     that's a value that old cloud providers won't understand).
+
+  3. When the `CloudNodeIPs` feature is enabled, the cloud provider
+     will use the new `kubernetes.io/provided-node-ip` annotation _if
+     the legacy alpha annotation is not set_. (But if both annotations
+     are set, it will prefer the legacy annotation, so as to handle
+     rollbacks correctly.)
+
+  4. A few releases after GA, kubelet can stop setting the legacy
+     annotation, and switch to unconditionally deleting it, and
+     setting/deleting the new annotation depending on whether
+     `--node-ip` was set or not. Cloud providers will also switch to
+     only using the new annotation, and perhaps logging a warning if
+     they see a node with the old annotation but not the new
+     annotation.
+
+Kubelet will preserve the existing behavior of _not_ passing
+"`0.0.0.0`" or "`::`" to the cloud provider, even via the new
+annotation. This is needed to preserve backward compatibility with
+current behavior in clusters using those `--node-ip` values. However,
+it _will_ pass "`IPv4`" and/or "`IPv6`" if they are passed as the
+`--node-ip`.
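+
+To make the set/delete rules above easier to follow, here is a
+minimal, hypothetical Go sketch of the kubelet-side and
+cloud-provider-side logic. The helper names are illustrative only
+(they are not the actual kubelet or `k8s.io/cloud-provider` code),
+`cloudNodeIPsEnabled` stands in for the `CloudNodeIPs` feature-gate
+check, and the special handling of "`0.0.0.0`"/"`::`" is omitted:
+
+```go
+const (
+    legacyNodeIPAnnotation = "alpha.kubernetes.io/provided-node-ip"
+    newNodeIPAnnotation    = "kubernetes.io/provided-node-ip"
+)
+
+// Kubelet side: keep both annotations in sync with --node-ip.
+// (Assumes annotations is a non-nil map of the Node's annotations.)
+func syncNodeIPAnnotations(annotations map[string]string, nodeIP string, cloudNodeIPsEnabled bool) {
+    if nodeIP == "" {
+        // --node-ip unset: delete both annotations, regardless of the
+        // feature gate, to avoid stale values after rollback or skew.
+        delete(annotations, legacyNodeIPAnnotation)
+        delete(annotations, newNodeIPAnnotation)
+        return
+    }
+    // --node-ip set: always set the legacy annotation, and mirror the
+    // same value into the new annotation when the gate is enabled.
+    annotations[legacyNodeIPAnnotation] = nodeIP
+    if cloudNodeIPsEnabled {
+        annotations[newNodeIPAnnotation] = nodeIP
+    }
+}
+
+// Cloud-provider side: prefer the legacy annotation when both are
+// set, so rollbacks are handled correctly.
+func providedNodeIP(annotations map[string]string) (string, bool) {
+    if v, ok := annotations[legacyNodeIPAnnotation]; ok {
+        return v, true
+    }
+    v, ok := annotations[newNodeIPAnnotation]
+    return v, ok
+}
+```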
+ +### Changes to cloud providers + +Assuming that all cloud providers use the `"k8s.io/cloud-provider"` +code to handle the node IP annotation and node address management, no +cloud-provider-specific changes should be needed; we should be able to +make the needed changes in the `cloud-provider` module, and then the +individual providers just need to revendor to the new version. + +### Example of `--node-ip` possibilities + +Assume a node where the cloud has assigned the IPs `1.2.3.4`, +`5.6.7.8`, `abcd::1234` and `abcd::5678`, in that order of preference. + +("SS" = "Single-Stack", "DS" = "Dual-Stack") + +| `--node-ip` value | New? | Annotation | Resulting node addresses | +|----------------------|------|------------------------|--------------------------| +| (none) | no | (unset) | `["1.2.3.4", "5.6.7.8", "abcd::1234", "abcd::5678"]` (DS IPv4-primary) | +| `0.0.0.0` | no | (unset) | `["1.2.3.4", "5.6.7.8", "abcd::1234", "abcd::5678"]` (DS IPv4-primary) | +| `::` | no | (unset) | `["1.2.3.4", "5.6.7.8", "abcd::1234", "abcd::5678"]` (DS IPv4-primary *) | +| `1.2.3.4` | no | `"1.2.3.4"` | `["1.2.3.4"]` (SS IPv4) | +| `9.10.11.12` | no | `"9.10.11.12"` | (error, because the requested IP is not available) | +| `abcd::5678` | no | `"abcd::5678"` | `["abcd::5678"]` (SS IPv6) | +| `1.2.3.4,abcd::1234` | yes* | `"1.2.3.4,abcd::1234"` | `["1.2.3.4", "abcd::1234"]` (DS IPv4-primary) | +| `IPv4` | yes | `"IPv4"` | `["1.2.3.4"]` (SS IPv4) | +| `IPv6` | yes | `"IPv6"` | `["abcd::1234"]` (SS IPv6) | +| `IPv4,IPv6` | yes | `"IPv4,IPv6"` | `["1.2.3.4", "abcd::1234"]` (DS IPv4-primary) | +| `IPv6,5.6.7.8` | yes | `"IPv6,5.6.7.8"` | `["abcd::1234", "5.6.7.8"]` (DS IPv6-primary) | +| `IPv4,abcd::ef01` | yes | `"IPv4,abcd::ef01"` | (error, because the requested IPv6 IP is not available) | + +Notes: + + - In the `--node-ip ::` case, kubelet will be expecting a + single-stack IPv6 or dual-stack IPv6-primary setup and so would + get slightly confused in this case since the cloud gave it a + dual-stack IPv4-primary configuration. (In particular, you would + have IPv4-primary nodes but IPv6-primary pods.) + + - `--node-ip 1.2.3.4,abcd::ef01` was previously valid syntax when + using no `--cloud-provider`, but was not valid for cloud kubelets. + +<<[UNRESOLVED multiple-addresses ]>> + +In the examples above, `--node-ip IPv4` means "pick any one IPv4 +address", not "use all IPv4 addresses". This seemed to me to be +consistent with the behavior when specifying a specific IP address (in +which case all other IPs of the same NodeAddressType are removed from +the list), but is inconsistent with the behavior of `--node-ip +0.0.0.0` / `--node-ip ::` with legacy cloud providers and bare metal +(where that affects the _sorting_ of IPs but does not do any +_filtering_). + +In past discussions (eg https://kep.k8s.io/1664, +https://issues.k8s.io/95768) no one has ever been entirely sure what +the point of having multiple node.status.addresses values is, and +there has never been a way to get multiple node.status.addresses +values (of the same IP family) on "bare metal", so that would tend to +prevent people from designing features that depended on multiple node +addresses. So it's not clear that there's really any downside in +filtering out the "unused" node IPs... + +<<[/UNRESOLVED]>> + +If the cloud only had IPv4 IPs for the node, then the same examples would look like: + +| `--node-ip` value | New? 
| Annotation | Resulting node addresses |
+|----------------------|------|------------------------|--------------------------|
+| (none) | no | (unset) | `["1.2.3.4", "5.6.7.8"]` (SS IPv4) |
+| `0.0.0.0` | no | (unset) | `["1.2.3.4", "5.6.7.8"]` (SS IPv4) |
+| `::` | no | (unset) | `["1.2.3.4", "5.6.7.8"]` (SS IPv4 *) |
+| `1.2.3.4` | no | `"1.2.3.4"` | `["1.2.3.4"]` (SS IPv4) |
+| `9.10.11.12` | no | `"9.10.11.12"` | (error, because the requested IP is not available) |
+| `abcd::5678` | no | `"abcd::5678"` | (error, because the requested IP is not available) |
+| `1.2.3.4,abcd::1234` | yes* | `"1.2.3.4,abcd::1234"` | (error, because the requested IPv6 IP is not available) |
+| `IPv4` | yes | `"IPv4"` | `["1.2.3.4"]` (SS IPv4) |
+| `IPv6` | yes | `"IPv6"` | (error, because no IPv6 IPs are available) |
+| `IPv4,IPv6` | yes | `"IPv4,IPv6"` | (error, because no IPv6 IPs are available) |
+| `IPv6,5.6.7.8` | yes | `"IPv6,5.6.7.8"` | (error, because no IPv6 IPs are available) |
+| `IPv4,abcd::ef01` | yes | `"IPv4,abcd::ef01"` | (error, because the requested IPv6 IP is not available) |
+
+In this case, kubelet would be even more confused in the
+`--node-ip ::` case, and some things would likely not work.
+By contrast, with `--node-ip IPv6`, the user would get a clear error.
+
+### Test Plan
+
+[X] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+
+##### Prerequisite testing updates
+
+##### Unit tests
+
+Most of the changes will be in `k8s.io/cloud-provider/node/helpers`.
+There will also be small changes in kubelet startup.
+
+- `k8s.io/kubernetes/pkg/kubelet`: `2023-01-30` - `66.9`
+- `k8s.io/kubernetes/pkg/kubelet/nodestatus`: `2023-01-30` - `91.2`
+- `k8s.io/kubernetes/vendor/k8s.io/cloud-provider/node/helpers`: `2023-01-30` - `31.7`
+- `k8s.io/kubernetes/vendor/k8s.io/cloud-provider/node/helpers/address.go`: `2023-01-30` - `100`
+
+##### Integration tests
+
+```
+<<[UNRESOLVED integration ]>>
+
+I don't know what existing integration testing of kubelet and cloud
+providers there is. Given unit tests for the new `--node-ip` parsing
+and `node.status.addresses`-setting code, and e2e tests of some sort,
+we probably don't need integration tests.
+
+<<[/UNRESOLVED]>>
+```
+
+- <test>: <link to test coverage>
+
+##### e2e tests
+
+```
+<<[UNRESOLVED e2e ]>>
+
+I'm not sure how we currently handle cloud-provider e2e. GCP does not
+support IPv6 and `kind` does not use a cloud provider, so we cannot
+test the new code/behavior in any of the "core" e2e jobs.
+
+@aojea suggests we _could_ use `kind` with an external cloud provider.
+This would presumably have to be a bespoke dummy cloud provider, but
+since we are only doing this to test the new `k/cloud-provider` code,
+having a mostly-dummied-out cloud provider should not be a problem? It
+could perhaps have a default `NodeAddresses` return value of
+"<an IPv4 address>, <a secondary IPv4 address>, <an IPv6 address>,
+<a secondary IPv6 address>", and then we could pass
+`--node-ip <the first IPv4 address>,IPv6` to kubelet. If the cloud
+provider did not receive and interpret the `--node-ip` value
+correctly, then we would end up with node IPs that didn't work and the
+test job should fail.
+
+<<[/UNRESOLVED]>>
+```
+
+- <test>: <link to test coverage>
+
+### Graduation Criteria
+
+#### Alpha
+
+- New `--node-ip` handling and annotation implemented behind a feature flag
+- Unit tests updated
+
+- Initial e2e tests completed and enabled?
Unless maybe the e2e tests + will take a bunch of development work on either kind or the + cloud-provider CI infrastructure, in which case we might decide to + pass on e2e testing in Alpha? + +#### Beta + +- Positive feedback / no bugs +- At least one release after Alpha +- Decision made about whether `--node-ip IPv4` means 1 IPv4 IP or all IPv4 IPs +- e2e tests definitely completed and enabled if they weren't already in Alpha +- e2e test that new annotation gets cleared on rollback + +#### GA + +- Positive feedback / no bugs +- Two releases after Beta to allow time for feedback + +#### Deprecation + +- Two releases after GA, kubelet will stop setting the legacy + `alpha.kubernetes.io/provided-node-ip` annotation (and start + deleting it). Cloud providers will stop honoring the legacy + annotation. + +- (Two releases after that, we can remove the code to delete the + legacy annotation when present.) + +### Upgrade / Downgrade Strategy + +No behavioral changes will happen automatically on upgrade, or +automatically on feature enablement; users must opt in to the feature +by changing their kubelet configuration after upgrading the cluster to +a version that supports the new feature. + +On downgrade/disablement, it is necessary to revert the kubelet +configuration changes before downgrading kubelet, or kubelet will fail +to start after downgrade. + +### Version Skew Strategy + +- Old kubelet / old cloud provider: Kubelet will only set the old + annotation, and the cloud provider will only read the old + annotation. Everything works (even if there's a stale new annotation + left over from a cluster rollback). + +- Old kubelet / new cloud provider: Kubelet will only set the old + annotation. The cloud provider will read the old annotation, and + will interpret it in the same way an old cloud provider would + (because all `--node-ip` values accepted by an old kubelet are + interpreted the same way by both old and new cloud providers). + Everything works (even if there's a stale new annotation left over + from a kubelet rollback, because the new cloud provider still + prefers the old annotation). + +- New kubelet, single-stack `--node-ip` value / old cloud provider: + Kubelet will set both annotations (to the same value). The cloud + provider will read the old annotation. Everything works, because + this is an "old" `--node-ip` value, and the old cloud provider knows + how to interpret it correctly. + +- New kubelet, dual-stack `--node-ip` value / old cloud provider: + Kubelet will set both annotations (to the same value). The cloud + provider will read the old annotation, but it will _not_ know how to + interpret it because it's a "new" value. So it will log an error and + leave the node tainted. (This is the desired result, since the cloud + provider is not able to obey the `--node-ip` value the administrator + provided.) + +- New kubelet / new cloud provider: Kubelet will set both annotations + (to the same value). The cloud provider will read the old + annotation, and will know how to interpret it regardless of whether + it's an "old" or "new" value. Everything works. + +- Future (post-GA) kubelet / GA cloud provider: Kubelet will set + the new annotation, and delete the old annotation if present. The + cloud provider will see that only the new annotation is set, and so + will use that. Everything works (even if there had been a stale old + annotation left over from the upgrade). + +- GA kubelet / Future (post-GA) cloud provider: Kubelet will set + both annotations (to the same value). 
The cloud provider will only + look at the new annotation. Everything works. + +- Future (post-GA) kubelet / Future (post-GA) cloud provider: Kubelet + will set the new annotation, and delete the old annotation if + present. The cloud provider will only look at the new annotation. + Everything works. + +## Production Readiness Review Questionnaire + +### Feature Enablement and Rollback + +###### How can this feature be enabled / disabled in a live cluster? + +- [X] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: CloudNodeIPs + - Components depending on the feature gate: + - kubelet + - cloud-controller-manager + +###### Does enabling the feature change any default behavior? + +Kubelet will begin setting two annotations on each node rather than +one. However, cloud providers will still prefer the old annotation and +will only read the new annotation if the old one isn't there. + +So assuming no bugs, the only visible change after (just) enabling the +feature is the duplicate annotation. + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + +Yes, as long as you also roll back the kubelet configuration to no +longer use the new feature. + +One edge case: if a user who was not previously setting `--node-ip` +upgrades, enables the feature, starts using `--node-ip`, and then +rolls back the kubelet and its configuration but _not_ the cloud +provider, then they will end up with a node that has a stale "new" +annotation (with the `--node-ip` value they had been using while +upgraded), but no "old" annotation (because the rolled-back kubelet +configuration no longer specifies `--node-ip` so kubelet will delete +the old annotation). The (not-rolled-back) cloud provider will then +mistake this as being the future rather than the past, assuming +kubelet intentionally set only the new annotation and not the old +annotation, and so it will obey the (stale) new annotation. + +That could be prevented by stretching out the feature enablement over +a few more releases, and not having the cloud-provider switch from +"always use the old annotation" to "prefer the old annotation when +it's set but use the new annotation if it's the only one" until a few +releases later. If we were going to do that, then perhaps we should +split the annotation renaming into a separate KEP from the dual-stack +`--node-ip` handling, so that the edge cases of the annotation +renaming don't have to slow down the dual-stack `--node-ip` adoption. + +###### What happens if we reenable the feature if it was previously rolled back? + +It works. + +###### Are there any tests for feature enablement/disablement? + +TBD; we should have a test that the annotation is cleared on rollback + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + +The operator is the one who would be using the feature (and they can +tell by looking at the kubelet configuration to see if a "new" +`--node-ip` value was passed). 
+ +###### How can someone using this feature know that it is working for their instance? + +The Node will have the IPs they expect it to have. + +- [X] API .status + - Other field: node.status.addresses + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + +N/A. This changes the startup behavior of Nodes (and does not affect +startup speed). There is no ongoing "service". + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + +N/A. This changes the startup behavior of Nodes (and does not affect +startup speed). There is no ongoing "service". + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + +No + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + +The feature depends on kubelet/cloud provider communication, but it is +just an update to an existing feature that already depends on +kubelet/cloud provider communication. It does not create any +additional dependencies, and it does not add any new failure modes if +either component fails. + +### Scalability + +###### Will enabling / using this feature result in any new API calls? + +No + +###### Will enabling / using this feature result in introducing new API types? + +No + +###### Will enabling / using this feature result in any new calls to the cloud provider? + +No + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + +Not really. Kubelet will now create two node-IP-related annotations on +each Node rather than just one, and the value of the annotation may +be slightly larger (eg because it now contains two IP addresses rather +than one), and some users may change their `--node-ip` to take +advantage of the new functionality in a way that would cause more node +IPs to be exposed in `node.status.addresses` than before. + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + +No + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + +No + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +It does not add any new failure modes. (The kubelet and cloud provider +use an annotation and an object field to communicate with each other, +but they _already_ do that. And the failure mode there is just +"updates don't get processed until the API server comes back".) + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +N/A: there are no SLOs. + +## Implementation History + +- Initial proposal: 2022-12-30 + +## Drawbacks + +The status quo is slightly simpler, but some people need the +additional functionality. + +## Alternatives + +In the discussion around [KEP-1664] there was talk of replacing +`--node-ip` with a new `--node-ips` (plural) field, but this was +mostly because the behavior of `--node-ip` in clusters with external +cloud providers was incompatible with the behavior of `--node-ip` in +clusters with legacy or no cloud providers and I wasn't sure if we +could get away with changing it to make it consistent. However, +[kubernetes #107750] did just that already, so there's now nothing +standing in the way of synchronizing the rest of the behavior. 
+ +[KEP-1664]: https://github.com/kubernetes/enhancements/issues/1664 +[kubernetes #107750]: https://github.com/kubernetes/kubernetes/pull/107750 diff --git a/keps/sig-network/3705-cloud-node-ips/kep.yaml b/keps/sig-network/3705-cloud-node-ips/kep.yaml new file mode 100644 index 000000000000..ec689cb80ac7 --- /dev/null +++ b/keps/sig-network/3705-cloud-node-ips/kep.yaml @@ -0,0 +1,49 @@ +title: Cloud Dual-Stack --node-ip Handling +kep-number: 3705 +authors: + - "@danwinship" +owning-sig: sig-network +participating-sigs: + - sig-node + - sig-cloud-provider +status: implementable +creation-date: 2022-11-29 +reviewers: + - "@aojea" + - "@JoelSpeed" + - "@mdbooth" + - "@khenidak" + - "@thockin" +approvers: + - "@thockin" + - TBD from sig-cloud-provider + +see-also: + - "https://github.com/kubernetes/enhancements/issues/1664" + - "https://github.com/kubernetes/enhancements/pull/1665" + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.27" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.27" + beta: "v1.28" + stable: "v1.29" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: CloudNodeIPs + components: + - kubelet + - cloud-controller-manager +disable-supported: true + +# The following PRR answers are required at beta release +metrics: diff --git a/keps/sig-network/3726-standard-application-protocols/README.md b/keps/sig-network/3726-standard-application-protocols/README.md new file mode 100644 index 000000000000..86e31a583f89 --- /dev/null +++ b/keps/sig-network/3726-standard-application-protocols/README.md @@ -0,0 +1,294 @@ +# KEP-3726: Standard Application Protocols + + + + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [New Standard Protocols](#new-standard-protocols) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Adding new protocols](#adding-new-protocols) + - [Followup work](#followup-work) + - [Documentation change](#documentation-change) + - [Test Plan](#test-plan) + - [Graduation Criteria](#graduation-criteria) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+
+- [x] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
+- [x] (R) KEP approvers have approved the KEP status as `implementable`
+- [x] (R) Design details are appropriately documented
+- [x] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
+  - [x] e2e Tests for all Beta API Operations (endpoints)
+  - [x] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+  - [x] (R) Minimum Two Week Window for GA e2e tests to prove flake free
+- [x] (R) Graduation criteria is in place
+  - [x] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+- [x] (R) Production readiness review completed
+- [x] (R) Production readiness review approved
+- [x] "Implementation History" section is up-to-date for milestone
+- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
+- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes
+
+[kubernetes.io]: https://kubernetes.io/
+[kubernetes/enhancements]: https://git.k8s.io/enhancements
+[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
+[kubernetes/website]: https://git.k8s.io/website
+
+## Summary
+
+There are cases where different implementations interpret the same application protocol name differently. That has already caused interoperability issues between certain implementations, with the possibility of more in the future. (See the example with GKE and Istio under [Motivation](#motivation).)
+
+This KEP suggests a different description for the `AppProtocol` field and proposes to declare standard Kubernetes protocol names for common protocols that are not IANA service names.
+
+## Motivation
+
+The lack of direct support for specifying application protocols for ports, and the widespread use of implementation-specific annotations to mitigate it, led Kubernetes to add the `AppProtocol` field to the port spec.
+
+While this is a good solution, we never came up with recommended standards other than [IANA standard service names](https://www.iana.org/assignments/service-names) or a domain-prefixed protocol.
+
+This loose definition has led us to:
+1. Instances where implementations do different things for common protocols.
+2. No support for interoperability between implementations when different domain-prefixed protocols (or a mix of domain-prefixed and non-prefixed protocols) are used for the same port.
+
+A good example of the first bullet is `HTTP2`.
+* In GKE, `appProtocol: HTTP2` describes HTTP2 over TLS (https://cloud.google.com/kubernetes-engine/docs/how-to/secure-gateway#load-balancer-tls).
+* In Istio, the same value is interpreted as h2c (HTTP2 over cleartext).
+
+That creates a problem: users running GKE and Istio in the same cluster can see very different behaviors for the same `appProtocol` value.
+
+### Goals
+
+* Rephrase the `AppProtocol` field description
+* Build consensus around how common protocols (that are not IANA service names) should be implemented
+* Help the broader community, and specifically implementations, to interoperate better
+* Provide short and clear documentation for how AppProtocol values should be implemented
+  * Update the appProtocol user documentation accordingly
+
+### Non-Goals
+
+* Validate appProtocol values
+* Monitor appProtocol implementations
+* Support multiple AppProtocol values for the same port to improve interoperability (though this is suggested as follow-up work)
+
+## Proposal
+
+~~Kubernetes supports the `appProtocol` field to provide a way to specify an application protocol for each Service port.~~
+
+The Kubernetes `appProtocol` field is used as a hint for implementations to configure the protocol used between the implementation and the application it exposes.
+
+The [documentation](https://kubernetes.io/docs/concepts/services-networking/service/#application-protocol) for this field says that:
+
+```Values should either be IANA standard service names or domain prefixed names such as mycompany.com/my-custom-protocol.```
+
+This KEP proposes to declare standard Kubernetes protocol names for common protocols that are not IANA standard service names.
+
+Those common protocols will be well-defined strings prefixed with ‘k8s.io’, for example `k8s.io/h2c`.
+
+### New Standard Protocols
+- 'k8s.io/http2'
+- 'k8s.io/grpc'
+- 'k8s.io/tcp'
+
+### Risks and Mitigations
+
+There are no real “risks”; the primary concerns are:
+1. End users will not migrate to `k8s.io/<>` values
+2. It will take a long time for implementations to align with the new standards
+3. We have no easy way to monitor who is aligned and who is not
+
+## Design Details
+
+At first, the collection of standard protocols is going to live in `ServicePort` and `EndpointPort` as part of the AppProtocol description.
+
+We might revisit this decision in the future and suggest an alternative location based on the number of standard protocols we support.
+
+Proposed changes to the API spec:
+
+```go
+type ServicePort struct {
+  ...
+  ...
+
+  // Used as a hint for implementations to
+  // configure the protocol used between the
+  // implementation and the application it exposes.
+  // This field follows standard Kubernetes label syntax.
+  // Valid values are either:
+  //
+  // * Un-prefixed protocol names - reserved for IANA standard service names (as per
+  //   RFC-6335 and https://www.iana.org/assignments/service-names).
+  //
+  // * Kubernetes standard names:
+  //   * 'k8s.io/http2' - http2 over cleartext, aka 'h2c'. https://www.rfc-editor.org/rfc/rfc7540
+  //   * 'k8s.io/grpc' - grpc traffic - see https://github.com/grpc/grpc/blob/v1.51.1/doc/PROTOCOL-HTTP2.md
+  //   * 'k8s.io/tcp' - plain tcp traffic
+  //
+  // * Other protocols should use prefixed names such as
+  //   mycompany.com/my-custom-protocol.
+  // +optional
+  AppProtocol *string
+```
+
+The same wording applies to type `EndpointPort`.
+
+### Adding new protocols
+
+In order to be included in the collection, a new protocol must:
+* Not be an [IANA standard service name](https://www.iana.org/assignments/service-names)
+* Run on top of an L4 protocol supported by Kubernetes Services
+* Be supported in two or more implementations
+* Be well defined and broadly used
+
+<<[UNRESOLVED sig-network ]>>
+
+If we require two implementations to support a protocol before we include it in the standard collection (i.e. the k8s.io-prefixed collection), we create a situation where we force them to support both their own domain-prefixed values and the k8s.io-prefixed values, have their users migrate to the k8s.io values once they are included, and the k8s.io values might also end up not being quite the same definition as the implementation-specific ones (as we see in the GKE & Istio example).
+
+The proposed follow-up work might also address this problem, when we turn the field into a list.
+
+<<[/UNRESOLVED]>>
+
+### Followup work
+
+To support implementation interoperability with different domain-prefixed protocols (or a mix of domain-prefixed and non-prefixed protocols) for the same port, we need to turn `AppProtocol` into a list.
+
+It is likely to be an API change, but the design details are TBD.
+
+### Documentation change
+
+The [kubernetes website](https://github.com/kubernetes/website/blob/main/content/en/docs/concepts/services-networking/service.md#application-protocol) will be changed accordingly.
+
+### Test Plan
+
+N/A
+
+This KEP does not plan to change code, just documentation.
+
+### Graduation Criteria
+
+### Upgrade / Downgrade Strategy
+
+N/A
+
+This KEP does not plan to change code, just documentation.
+
+### Version Skew Strategy
+
+N/A
+
+This KEP does not plan to change code, just documentation.
+
+## Production Readiness Review Questionnaire
+
+### Feature Enablement and Rollback
+
+N/A
+
+This KEP does not plan to change code, just documentation.
+
+### Rollout, Upgrade and Rollback Planning
+
+N/A
+
+This KEP does not plan to change code, just documentation.
+
+### Monitoring Requirements
+
+N/A
+
+This KEP does not plan to change code, just documentation.
+
+### Dependencies
+
+N/A
+
+This KEP does not plan to change code, just documentation.
+
+### Scalability
+
+N/A
+
+This KEP does not plan to change code, just documentation.
+
+### Troubleshooting
+
+N/A
+
+This KEP does not plan to change code, just documentation.
+
+## Implementation History
+
+N/A
+
+## Drawbacks
+
+* The collection of standard protocols can become stale fairly quickly when new protocols are implemented before we decide to declare them as part of the k8s.io common collection. That can lead back to the current state, where implementations already implement support without a prefix (although they should not) OR with a domain prefix.
+
+## Alternatives
+
diff --git a/keps/sig-network/3726-standard-application-protocols/kep.yaml b/keps/sig-network/3726-standard-application-protocols/kep.yaml
new file mode 100644
index 000000000000..c7790c3781a0
--- /dev/null
+++ b/keps/sig-network/3726-standard-application-protocols/kep.yaml
@@ -0,0 +1,34 @@
+title: Standard Application Protocols
+kep-number: 3726
+authors:
+  - "@LiorLieberman"
+owning-sig: sig-network
+
+status: provisional
+creation-date: 2023-01-11
+reviewers:
+  - "@aojea"
+  - "@robscott"
+  - "@howardjohn"
+  - "@danwinship"
+approvers:
+  - "@thockin"
+
+# The target maturity stage in the current dev cycle for this KEP.
+stage: stable + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.27" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + stable: "v1.27" + +see-also: [] +replaces: [] + +feature-gates: [] +disable-supported: false +metrics: [] diff --git a/keps/sig-node/2133-kubelet-credential-providers/README.md b/keps/sig-node/2133-kubelet-credential-providers/README.md index 1e3b95777169..3f7ac42d3a79 100644 --- a/keps/sig-node/2133-kubelet-credential-providers/README.md +++ b/keps/sig-node/2133-kubelet-credential-providers/README.md @@ -48,10 +48,10 @@ Items marked with (R) are required *prior to targeting to a milestone / release* - [X] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input - [X] (R) Graduation criteria is in place - [X] (R) Production readiness review completed -- [ ] Production readiness review approved -- [ ] "Implementation History" section is up-to-date for milestone -- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] -- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes +- [X] Production readiness review approved +- [X] "Implementation History" section is up-to-date for milestone +- [X] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [X] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes +# KEP-3673: Kubelet limit of Parallel Image Pulls + + + + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [QPS/burst limits on kubelet are confusing](#qpsburst-limits-on-kubelet-are-confusing) + - [No way to limit the number of inflight image pulls](#no-way-to-limit-the-number-of-inflight-image-pulls) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a 
milestone / release*.
+
+- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
+- [ ] (R) KEP approvers have approved the KEP status as `implementable`
+- [ ] (R) Design details are appropriately documented
+- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
+  - [ ] e2e Tests for all Beta API Operations (endpoints)
+  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
+- [ ] (R) Graduation criteria is in place
+  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+- [ ] (R) Production readiness review completed
+- [ ] (R) Production readiness review approved
+- [ ] "Implementation History" section is up-to-date for milestone
+- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
+- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes
+
+[kubernetes.io]: https://kubernetes.io/
+[kubernetes/enhancements]: https://git.k8s.io/enhancements
+[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
+[kubernetes/website]: https://git.k8s.io/website
+
+## Summary
+
+This KEP proposes adding to kubelet a node-level limit on the number of parallel image pulls.
+
+## Motivation
+
+### QPS/burst limits on kubelet are confusing
+Currently kubelet limits image pulls with QPS and burst, but these limits are confusing: they only limit the number of requests sent to the container runtime and do not consider the number of inflight image pulls. In other words, even if a small QPS is set, there could still be many image pulls in progress in parallel if each pull takes a long time. See [issue #112044](https://github.com/kubernetes/kubernetes/issues/112044) as an example.
+
+### No way to limit the number of inflight image pulls
+A user might want to put a limit on the number of images being pulled at the same time, to avoid burdening disk IO or consuming too much network bandwidth. However, currently neither kubelet nor containerd limits the number of inflight image pulls.
+
+On kubelet, as mentioned above, the QPS/burst limit does not take the in-progress pulls into account. On containerd, there is only [a per-image limit](https://github.com/containerd/containerd/blob/8e787543deede10f372b0e16ad5c07790fc680b6/pkg/cri/config/config.go#L299-L300), which only limits the number of layers downloaded in parallel for each image, but potentially allows an unlimited number of downloads overall.
+
+As a side note, cri-o has a configuration to limit the maximum number of in-progress layer downloads.
+
+### Goals
+
+Adding a node-level limit of parallel image pulls to kubelet. This limit caps the maximum number of images being pulled in parallel; any image pull request beyond the limit will block until one of the in-flight image pulls finishes (see the sketch after the Non-Goals list below).
+
+### Non-Goals
+
+* Limiting the number of image layers being downloaded at the same time.
+* Prioritizing image pulls in any way.
+* Using the number of inflight image pulls as a signal to direct pod scheduling.
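+
+As a rough illustration of the intended blocking behavior (a minimal
+sketch under assumed names, not the actual kubelet puller code), a
+buffered channel can act as a semaphore that caps the number of
+in-flight pulls:
+
+```go
+package puller // hypothetical package, for illustration only
+
+import "context"
+
+// pullLimiter is a hypothetical helper: a buffered channel whose
+// capacity equals maxParallelImagePulls.
+type pullLimiter struct {
+    tokens chan struct{}
+}
+
+func newPullLimiter(maxParallelImagePulls int) *pullLimiter {
+    return &pullLimiter{tokens: make(chan struct{}, maxParallelImagePulls)}
+}
+
+// pullImage blocks while maxParallelImagePulls pulls are already in
+// flight, then runs the actual pull and releases its slot when done.
+func (l *pullLimiter) pullImage(ctx context.Context, image string, pull func(context.Context, string) error) error {
+    select {
+    case l.tokens <- struct{}{}: // acquire a slot (blocks when the channel is full)
+    case <-ctx.Done():
+        return ctx.Err()
+    }
+    defer func() { <-l.tokens }() // release the slot
+    return pull(ctx, image)
+}
+```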
+
+## Proposal
+
+* A new integer field `maxParallelImagePulls` will be added to Kubelet Configuration. `maxParallelImagePulls` is the maximum number of inflight image pulls.
+* The default value of `maxParallelImagePulls` will be set to 0, which applies no limit on parallel pulls (the same as the current behavior).
+* When `serialize-image-pulls` is true, `maxParallelImagePulls` will be ignored. A warning message will be logged by kubelet if `serialize-image-pulls` is true and `maxParallelImagePulls` is larger than 1.
+
+### User Stories (Optional)
+
+#### Story 1
+A Kubernetes user wants to enable parallel image pulls to reduce workload startup latency, but they don't want too many images to be pulled at the same time, as that might burden disk IO and therefore have a negative impact on disk performance, or might use too much network bandwidth. By setting a proper `maxParallelImagePulls`, they have better control over the parallelism and can therefore avoid overwhelming the disk.
+
+### Notes/Constraints/Caveats (Optional)
+
+### Risks and Mitigations
+
+Since this feature is purely in-memory, if kubelet restarts, it loses track of the image pulls started before the restart. It seems that even on a non-graceful shutdown of kubelet, in-flight image pulls are cancelled, but there might be corner cases where previous image pulls are still being performed after the restart, exceeding `maxParallelImagePulls` in total.
+
+## Design Details
+
+* The implementation will be similar to the existing `serialImagePuller`. More specifically:
+  * `maxParallelImagePulls` will be passed to [parallelImagePuller](https://github.com/kubernetes/kubernetes/blob/054ceab58d803df0c4910fa722138f0ed2d94701/pkg/kubelet/images/puller.go#L41), and `parallelImagePuller` will create a channel of the size of `maxParallelImagePulls`.
+  * `parallelImagePuller.pullImage` will simply try to send the pull request to the channel, and will be blocked if the channel is full.
+  * A goroutine will keep processing the requests in the channel.
+
+### Test Plan
+
+[ ] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+
+##### Prerequisite testing updates
+
+None.
+
+##### Unit tests
+
+A new unit test will be added to image_manager_test.go.
+- `k8s.io/kubernetes/pkg/kubelet/images/puller.go`: `01/05/2023` - `100.0`
+
+##### Integration tests
+
+See e2e tests section below.
+
+- <test>: <link to test coverage>
+
+##### e2e tests
+
+A new node_e2e test with `serialize-image-pulls==false` will be added to make sure that when `maxParallelImagePulls` is reached, all further image pulls will be blocked.
+
+- <test>: <link to test coverage>
+
+### Graduation Criteria
+
+#### Alpha
+- Initial e2e tests completed and enabled
+
+#### Beta
+- Gather feedback from developers and surveys
+
+#### GA
+- Gather feedback from real-world usage from Kubernetes vendors.
+- Allow time for feedback.
+
+### Upgrade / Downgrade Strategy
+
+N/A
+
+### Version Skew Strategy
+
+N/A
+
+## Production Readiness Review Questionnaire
+
+### Feature Enablement and Rollback
+
+###### How can this feature be enabled / disabled in a live cluster?
+
+- [X] Other
+  - Describe the mechanism: The feature will be enabled when the kubelet config field `maxParallelImagePulls` is set to a non-zero value; it can be disabled by setting `maxParallelImagePulls` to 0 and restarting kubelet.
+  - Will enabling / disabling the feature require downtime of the control
+    plane?
+    - No
+  - Will enabling / disabling the feature require downtime or reprovisioning
+    of a node? (Do not assume `Dynamic Kubelet Config` feature is enabled).
+    - Yes, it requires restarting kubelet.
+
+###### Does enabling the feature change any default behavior?
+
+The change itself will not change any default behavior. The default behavior will only be changed when the user explicitly sets `maxParallelImagePulls` to a non-zero value.
+
+###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?
+
+The code change itself cannot be rolled back. But a user can roll back to the existing default behavior by setting `maxParallelImagePulls` to 0, or not setting it and letting it default to 0.
+
+###### What happens if we reenable the feature if it was previously rolled back?
+
+Nothing will happen.
+
+###### Are there any tests for feature enablement/disablement?
+
+This feature is purely in-memory, so such a test isn't really needed.
+
+### Rollout, Upgrade and Rollback Planning
+
+###### How can a rollout or rollback fail? Can it impact already running workloads?
+
+###### What specific metrics should inform a rollback?
+
+###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
+
+###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?
+
+### Monitoring Requirements
+
+###### How can an operator determine if the feature is in use by workloads?
+
+###### How can someone using this feature know that it is working for their instance?
+
+- [ ] Events
+  - Event Reason:
+- [ ] API .status
+  - Condition name:
+  - Other field:
+- [ ] Other (treat as last resort)
+  - Details:
+
+###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
+
+###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?
+
+- [ ] Metrics
+  - Metric name:
+  - [Optional] Aggregation method:
+  - Components exposing the metric:
+- [ ] Other (treat as last resort)
+  - Details:
+
+###### Are there any missing metrics that would be useful to have to improve observability of this feature?
+
+### Dependencies
+
+###### Does this feature depend on any specific services running in the cluster?
+
+### Scalability
+
+###### Will enabling / using this feature result in any new API calls?
+
+No.
+
+###### Will enabling / using this feature result in introducing new API types?
+
+No.
+
+###### Will enabling / using this feature result in any new calls to the cloud provider?
+
+No.
+
+###### Will enabling / using this feature result in increasing size or count of the existing API objects?
+
+No.
+
+###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?
+
+No.
+
+###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components?
+
+No.
+
+### Troubleshooting
+
+###### How does this feature react if the API server and/or etcd is unavailable?
+
+###### What are other known failure modes?
+
+###### What steps should be taken if SLOs are not being met to determine the problem?
+ +## Implementation History + + + +## Drawbacks + + + +## Alternatives + + + +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-node/3673-kubelet-parallel-image-pull-limit/kep.yaml b/keps/sig-node/3673-kubelet-parallel-image-pull-limit/kep.yaml new file mode 100644 index 000000000000..bb44ef069745 --- /dev/null +++ b/keps/sig-node/3673-kubelet-parallel-image-pull-limit/kep.yaml @@ -0,0 +1,26 @@ +title: KEP Template +kep-number: 3673 +authors: + - "@pacoxu" + - "@Ruiwen-Zhao" +owning-sig: sig-node +status: implementable +creation-date: 2023-01-05 +reviewers: + - "@SergeyKanzhelev" +approvers: + - "@mrunalp" + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.27" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.27" + beta: "v1.28" + stable: "v1.29" diff --git a/keps/sig-node/3675-resource-plugin-manager/CCIAllocSeqAlpha.jpg b/keps/sig-node/3675-resource-plugin-manager/CCIAllocSeqAlpha.jpg new file mode 100644 index 000000000000..866db06824c0 Binary files /dev/null and b/keps/sig-node/3675-resource-plugin-manager/CCIAllocSeqAlpha.jpg differ diff --git a/keps/sig-node/3675-resource-plugin-manager/CCIAllocationSequenceDiagram.jpg b/keps/sig-node/3675-resource-plugin-manager/CCIAllocationSequenceDiagram.jpg new file mode 100644 index 000000000000..579339521ce1 Binary files /dev/null and b/keps/sig-node/3675-resource-plugin-manager/CCIAllocationSequenceDiagram.jpg differ diff --git a/keps/sig-node/3675-resource-plugin-manager/CCIArchAlpha.jpg b/keps/sig-node/3675-resource-plugin-manager/CCIArchAlpha.jpg new file mode 100644 index 000000000000..84df5a2282dc Binary files /dev/null and b/keps/sig-node/3675-resource-plugin-manager/CCIArchAlpha.jpg differ diff --git a/keps/sig-node/3675-resource-plugin-manager/CCIRemovalSequenceDiagram.jpg b/keps/sig-node/3675-resource-plugin-manager/CCIRemovalSequenceDiagram.jpg new file mode 100644 index 000000000000..0c6c1f15ec7e Binary files /dev/null and b/keps/sig-node/3675-resource-plugin-manager/CCIRemovalSequenceDiagram.jpg differ diff --git a/keps/sig-node/3675-resource-plugin-manager/CCIRemoveSeqAlpha.jpg b/keps/sig-node/3675-resource-plugin-manager/CCIRemoveSeqAlpha.jpg new file mode 100644 index 000000000000..ba56172081bb Binary files /dev/null and b/keps/sig-node/3675-resource-plugin-manager/CCIRemoveSeqAlpha.jpg differ diff --git a/keps/sig-node/3675-resource-plugin-manager/CCITroubleshooting.jpg b/keps/sig-node/3675-resource-plugin-manager/CCITroubleshooting.jpg new file mode 100644 index 000000000000..a8e8f92b5b53 Binary files /dev/null and b/keps/sig-node/3675-resource-plugin-manager/CCITroubleshooting.jpg differ diff --git a/keps/sig-node/3675-resource-plugin-manager/ExternallyManagedResourceExtension.jpg b/keps/sig-node/3675-resource-plugin-manager/ExternallyManagedResourceExtension.jpg new file mode 100644 index 000000000000..18e5fe559bb6 Binary files /dev/null and b/keps/sig-node/3675-resource-plugin-manager/ExternallyManagedResourceExtension.jpg differ diff --git a/keps/sig-node/3675-resource-plugin-manager/README.md b/keps/sig-node/3675-resource-plugin-manager/README.md new file mode 100644 index 000000000000..82110162626f --- /dev/null +++ b/keps/sig-node/3675-resource-plugin-manager/README.md @@ -0,0 +1,1319 @@ + +# KEP-3675: Pluggable Resource 
Management +Container Compute Interface (CCI) Driver Extension + + + + + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Alpha Goals](#alpha-goals) + - [Beta & Post-Beta Goals](#beta--post-beta-goals) + - [Non-Goals](#non-goals) + - [Not In Scope for Alpha](#not-in-scope-for-alpha) +- [Proposal](#proposal) + - [User Stories](#user-stories) + - [Custom workloads, such as HPC/AI/ML](#custom-workloads-such-as-hpcaiml) + - [Power optimization of workloads](#power-optimization-of-workloads) + - [Research of new resource management patterns within the cloud](#research-of-new-resource-management-patterns-within-the-cloud) + - [User-specific plugins](#user-specific-plugins) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Compute Specification Option](#compute-specification-option) + - [Example 1: Attributed-based resource specification](#example-1-attributed-based-resource-specification) + - [Example 2.: Policy-based resource specification](#example-2-policy-based-resource-specification) + - [Resource Manager Architecture](#resource-manager-architecture) + - [Allocation & Container Removal Flow](#allocation--container-removal-flow) + - [Test Plan](#test-plan) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha to Beta](#alpha-to-beta) + - [Beta to GA](#beta-to-ga) + - [Deprecation](#deprecation) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Operational Requirements for Alpha:](#operational-requirements-for-alpha) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed](#infrastructure-needed) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + + + +The authors have taken inspiration from the [CSI development](https://github.com/kubernetes/design-proposals-archive/blob/main/storage/container-storage-interface.md). +Kubernetes compute management is tightly integrated with the Kubelet and the +existing suite of resources managers including the topology manager, CPU manager, +memory manager, and device manager. While these managers have added functionality that +has addressed a varied set of use cases, they do present the community with several challenges. + +Adding a new capability to one of these managers is slow moving and difficult +due to their complex interactions, the level of prudence required given the potential +impacts, the implied necessity to make the extension work in as many scenarios as +possible, as well as the overhead heaped onto the sig-node maintainers. More +details on the challenges discussed with the community have been captured in +the [CPU Management Kubelet Use Cases & Current State document](https://docs.google.com/document/d/1U4jjRR7kw18Rllh-xpAaNTBcPsK5jl48ZAVo7KRqkJk). + +As a result, adding optimizations necessary to support some more niche use cases, +to improve total cost of ownership for deployments with more exacting requirements +on resources or to provide a mechanism to progressively roll-out the benefits of +innovations related to newer CPU architectures is difficult and high cost to community resources. + +This proposal aims to address the challenges by introducing a new Container Compute +Interface (CCI). The CCI is conceptually equivalent to the CSI in that it will +delineate between the responsibility of the Kubelet and lower-level responsibility +of compute focused plugins for resource assignment capabilities, such CPU and memory. 
+
+We are proposing an iterative implementation approach: in the first stage (alpha) we
+propose to guarantee the exclusive operation of the CCI Manager on cpu and memory resources
+through a feature gate and a requirement to have the CPU Manager policy set to None. All pods will be handled
+by the new CCI Manager. This will allow us to avoid state synchronization issues with the cpu manager.
+To avoid the bootstrapping and driver-unavailability problems of CCI we propose to
+use best-effort handling of those cases in the alpha state of the implementation, augmented with additional logging.
+
+In a later phase of the implementation we can provide a proper mechanism to handle driver unavailability cases.
+Given the existing Kubelet CPU/memory manager capabilities and their usefulness
+for many scenarios, we would also like to consider whether resource management for certain pods has to be handled without drivers and whether we need
+to associate specific pods with CCI Drivers (association mechanisms will be investigated in a
+later phase).
+
+The implication of CCI and the addition of the proposed CCI plugin manager
+will be to allow compute capabilities such as CPU and memory to be managed outside
+of the Kubelet via pluggable CCI Drivers. This will allow users to augment the
+current implementation of Kubelet managed resources with support for more complex
+use-cases without having to modify the Kubelet. The CCI extensions can coexist
+with the existing CPU and memory allocation techniques available to Kubernetes users today.
+
+These changes will allow the community to disaggregate the long-term
+stability and advancement of the Kubelet from the task of improving the compute
+resource management while keeping pace with the needs of
+specialized use cases and the advancements in the compute vendor ecosystem.
+
+
+
+## Motivation
+
+
+
+Users have varied workloads; the current set of available configurations
+for CPU, memory, and topology remains limited. Additionally, the number of managers
+internal to the Kubelet continues to increase; we should find a more
+dynamic and pluggable way of handling these resources.
+
+Operating systems work by allowing drivers and pluggable resources. This includes various
+policies in how cpu, memory, and devices are allocated. Kubernetes can be viewed as
+the operating system of the cloud. Allowing specialty modules to address the use cases
+directly, rather than continuing to add complexity by modifying the kubelet,
+will allow the greatest scope of abilities while halting continued increases of complexity
+within the core of the Kubelet.
+
+Users would like to be able to address the following use cases:
+
+* The ability to allow vendors to release vendor-specific managers for their hardware
+  and provide an interface while not having vendor-specific code within Kubelet.
+* Differentiate between types of cores and memory.
+  Note - Dynamic resource allocation does this with regular resources today. We seek to
+  extend this ability.
+* Differentiate between different configurations of cores and memory, for instance cores
+  designated as performance cores versus those designated as efficiency cores.
+* Have custom plugins to optimize for particular types of workloads. These plugins
+  may be built for performance, power reduction, cpu savings, et cetera.
+  Note: Currently, there are very limited sets of available topology policies. Every
+  new policy must be approved and be in lockstep with the Kubernetes release process.
+* Be able to hot-plug and test new resource managers.
+* Be able to remove some of the complexity of the current setup and, over time, reduce
+  the amount of code contained within the Kubelet. Instead, build a library that addresses specific
+  needs.
+* Have a faster path to desired changes, without potentially impacting the core of
+  Kubernetes with every policy change.
+  Note that current solutions have been cropping up to allow for resource management
+  outside the core Kubelet. These solutions require turning off the Kubelet functionality
+  and overriding current Kubelet allocation. We should provide a path otherwise. Many of these solutions are custom to particular companies, and many are not open source. The community should provide a process to allow this functionality in the core kubelet at a granular level, the same as we have in many systems, such as HPC or telco.
+* Leverage additional parts of the pod spec information which could help make allocation decisions
+  without the need to query the k8s control plane.
+* Be able to do research, with minimum toil, on new policies and resource management strategies.
+
+This design will also use gRPC, which is used for many other pluggable components within Kubernetes.
+The hope is that it will also, as a side effect, allow a path to simplify the Kubelet as it is
+today and move the complexity into external plugins.
+
+### Goals
+
+
+
+#### Alpha Goals
+
+1. Provide initial infrastructure to support CCI Driver Plugins and k8s Deployments requiring CCI Drivers after scheduling.
+2. Provide a feature gate to enable the CCI Manager. The feature will require the CPU Manager policy to be set to None.
+3. Provide a CCI test driver which can demonstrate CCI Driver implementation requirements and several resource management use-cases for cpu and memory.
+4. Provide documentation of the CCI Manager and the provided test driver plus illustration of the currently covered use-cases.
+5. Support seamless k8s start.
+6. CCI Driver results (cpusets and memory affinity) are passed back to kubelet and allocated.
+7. Handle resource management state for cpu and memory through CCI.
+8. Support proper fail mechanisms for Pods if the CCI driver is not available - error message + retry.
+9. Continue cluster operation if the CCI driver is not available.
+10. Podspecs: Will be able to support current pod specs. While there may be additional
+extensibility in the future, the current pod specs will still work.
+
+
+#### Beta & Post-Beta Goals
+
+1. Support standard pod deployments not requiring CCI Drivers through CPU & Memory Manager: all QoS types.
+2. Support hint providers for topology manager.
+3. Interoperability with Device Plugins, DRA, et cetera.
+4. Identify and provide a minimal in-cluster operational core (from existing CPU Manager, Memory Manager, Topology Manager) either through in-tree libraries or through further KEPs.
+5. Replace the CCI policy with a component integrated in the None and Static CPU Manager policies.
+6. Gather feedback on the feature through surveys.
+7. E2E testing including other components.
+8. Adding support for e2e tests in testgrid.
+9. Health identification mechanisms and automatic health-repair procedures for CCI Drivers.
+10. Optimize on-node resource usage (approaches to avoid waste of resources).
+
+
+### Non-Goals
+
+
+
+* Break any existing use cases for topology manager, memory manager, cpu manager, or
+device manager.
+* Creating any more latency than there is today for scheduling: We should be
+careful not to add any latency into scheduling over what exists today for default behavior.
+* CPU & Memory Management on the scheduler side.
+* Optimal bin-packing of resources (we will still consider if we can optimize resource utilization in Beta).
+
+
+## Proposal
+
+
+
+
+### User Stories
+
+
+
+
+#### Custom workloads, such as Telco/HPC/AI/ML
+
+Custom workloads often have a desire for a mix of types of cores.
For instance, a workload should be able to have some number of static cores and some number of shared cores.
+A node should be able to allow both types of cores, without having to have one setting
+or another, and be able to pull from these pools accordingly. Additionally, there is
+a need to have some high-priority cores for higher performance and other lower-priority
+cores for less-sensitive parts of a workload. In these use cases, the workloads
+may also have particular types of NUMA splits required.
+
+#### Power optimization of workloads
+
+Cores should be able to be quickly spun up or down according to resource requirements.
+Additionally, the nodes should be packed as densely as possible with regard to core usage.
+There should also be the ability to choose between efficiency cores and
+performance cores within newer architectures, according to workload requirements.
+
+#### Research of new resource management patterns within the cloud
+
+There are a variety of modifiers that can be placed around cores. Static cores,
+isolated cores, shared cores, efficiency cores, and performance cores are only the
+beginning of unexplored spaces. Being able to experiment with various patterns in
+research without having to be an expert in how to modify the Kubelet and its multiple
+internal managers is a decisive benefit to the research community.
+
+#### User-specific plugins
+
+A user may have very specific allocation patterns they require. This sort of capability
+may be rare and not belong upstream in mainstream Kubernetes, but there should still
+be a simple way to allow users to meet their specific needs.
+
+### Notes/Constraints/Caveats (Optional)
+
+
+
+### Risks and Mitigations
+
+
+
+## Design Details
+
+
+
+#### Compute Specification Option
+
+The CCI model, combined with some of the capabilities introduced in the Dynamic
+Resource Allocation (DRA) KEP [3], offers the ability to transition compute
+resource allocation behavior from being a cluster-admin dominated configuration
+to one that allows users with precise compute requirements to articulate the
+compute attributes that they need.
+
+In several domains such as Streaming, Telco, HPC, and AI/ML, users require
+fine-grained compute resource control to maximize their key performance
+indicators. They need an API option that can be made available through
+upstream Kubernetes components and specific compute resource allocation
+plugins.
+
+##### Example 1: Attributed-based resource specification
+
+One realization of such an API can be achieved by an attribute-based
+mechanism for compute resources. The attribute-based compute resource
+request model is a step towards improved semantic workload portability
+that can be tried in a progressive manner with CCI.
+
+In the following, there is an example of a novel approach that could be
+fulfilled with the CCI architecture.
+This example presents a set of per-core attributes in a config-map form that enables precision beyond a policy-only based approach:
+
+    # A resource request which consists of:
+    # 2 exclusive cores (no other processes running on them)
+    # * the cores shall be siblings
+    # * they shall run at 2.1 GHz
+    # * the application assigned has the highest priority on the core
+    # * there are no IRQs on the core
+    # 6 exclusive cores (no other processes running on them)
+    # 4 shared cores (other processes can run on them)
+    apiVersion: v1
+    kind: ConfigMap
+    metadata:
+      name: example-compute-claim-parameters
+      namespace: default
+    data:
+      cores: |
+        # number of core type 1
+        2
+        # number of core type 2
+        6
+        # number of core type 3
+        4
+      compute_attributes: |
+        # Attributes of core type 1
+        exclusive, smt-sibling-required, 2.1Ghz, 1.0 priority, no-irq
+        # Attributes of core type 2
+        exclusive
+        # Attributes of core type 3
+        shared
+
+##### Example 2.: Policy-based resource specification
+
+Another option for achieving a resource allocation configuration for Kubernetes
+Pods is through a policy mechanism. In this case the
+policy will be applied to all Pods to be processed. The configuration
+of the policy can happen directly in a CCI Compute Resource Driver Plugin
+as an input argument in the YAML deployment. This approach can be used to
+realize behaviors similar to the static CPU manager policy or other policies.
+
+##### Resource Manager Architecture
+
+The extension consists of an optional CCI driver reference inside the Pod spec which
+indicates that, if a CCI driver is specified, it shall process the Pod's compute resources.
+The association between pod and CCI Driver can also be realised as a resource class
+argument (this will be considered as an option during the implementation).
+
+In the alpha version, the Resource Manager applies driver plugins to manage resource
+allocations for Pods. Post-alpha we will consider if we need to explicitly mark which pods are to be handled by an allocation
+plugin. The overall architecture consists of a manager component – the Resource
+Manager inside the container manager, an optional Pod spec argument to identify which
+Pods are to be processed by the manager (post-alpha), and a compute resource store – a component used to store
+the resource sets (cpusets, memory affinity configuration) for the resource plugins. The CCI Manager
+is enabled via a feature gate. After turning on the feature gate all pod allocation requests for cpu
+and memory will be handled by the CCI Manager.
+
+![image](CCIArchAlpha.jpg)
+Figure 1: CCI Resource Manager Architecture inside Kubelet
+
+    Pod Spec "CCIDriverName Extension"
+    type PodSpec {
+      …
+      // CCIResourceDriver is an optional parameter which can identify if a Pod's resource management (cpu and memory)
+      // has to be handled by a pluggable CCI resource driver
+      // This is an alpha field and requires enabling the
+      // CCIResourceManager feature gate.
+      //
+      // This field is immutable.
+      //
+      // +featureGate=CCIResourceManager
+      // +optional
+      CCIResourceDriver string `json:"cciResourceDriver,omitempty" protobuf:"bytes,40,opt,name=cciResourceDriver"`
+      …}
+
+The proposed extension relies on a new optional argument inside the Pod spec, `cciResourceDriver`.
+We use this optional argument to drive the allocation process. If the driver name is not provided,
+the Pod resources will be allocated through the standard CPU manager within the kubelet. An
+alternative association approach can be considered during the implementation which can avoid
+Pod spec changes.
+
+1. CCI Resource Manager
+These changes will need a separate resource manager component inside the container
+manager which will handle plugin management tasks, including plugin registration and
+invocations of registered plugins. These tasks will be managed via gRPC for Pods that
+require one or more resource management plugin(s) to get successfully allocated. In
+the alpha version we provide entry points to plugins for the admission, addContainer,
+and removeContainer lifetime events (plugins have to implement Admit, AddResource,
+and RemoveResource handlers accordingly).
+
+2. CCI Resource Store
+The Resource Manager uses a store to keep track of resource sets, which can include
+cpusets, memory, etc., allocated by CCI Drivers. This information is gathered
+per container. The store will be used by a new cpu manager policy to synchronize
+the state between the cpu manager and pluggable CCI Drivers. The store offers the
+following interface to manage resource sets:
+
+    AddResource(Pod, container, resourceset)
+    RemoveResource(container)
+    AvailableResources(): resourceset
+
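+To make the store's role concrete, here is a minimal, illustrative Go sketch of an
+in-memory store; the type and method names are assumptions made for this example only
+and are not the proposed implementation.
+
+```go
+// Illustrative in-memory CCI resource store (names are assumptions, not the final API).
+package ccistore
+
+import "sync"
+
+// ResourceSet captures what a CCI driver reserved for a container.
+type ResourceSet struct {
+	CPUSet    string // e.g. "0-3,8"
+	Exclusive bool
+}
+
+// Store tracks per-container resource sets returned by CCI drivers.
+type Store struct {
+	mu        sync.RWMutex
+	allocated map[string]ResourceSet // keyed by container ID
+}
+
+func NewStore() *Store {
+	return &Store{allocated: map[string]ResourceSet{}}
+}
+
+// AddResource records the resource set reserved for a container of a pod.
+func (s *Store) AddResource(podUID, containerID string, rs ResourceSet) {
+	_ = podUID // kept for signature parity with the interface above
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.allocated[containerID] = rs
+}
+
+// RemoveResource drops the record when the container is removed.
+func (s *Store) RemoveResource(containerID string) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	delete(s.allocated, containerID)
+}
+
+// AllocatedResources returns a snapshot of all current reservations; a CPU manager
+// policy could subtract these from node capacity to compute available resources.
+func (s *Store) AllocatedResources() map[string]ResourceSet {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	out := make(map[string]ResourceSet, len(s.allocated))
+	for k, v := range s.allocated {
+		out[k] = v
+	}
+	return out
+}
+```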
+4. CCI Drivers Plugin Interface
+The initial interface of resource management plugins is very simple and consists
+of three functions:
+
+    CCIAdmit(Pod, container, cci spec, available resources)
+    CCIAddContainerResource(Pod, container, cgroup, containerid)
+    CCIRemoveContainerResource(containerid): err
+
+The `CCIAdmit` function indicates whether a given container belonging to a
+Pod and having a specific CCI spec can be admitted to the next stage of the allocation
+pipeline. The admission will return a reserved resource-set or an error. In case of
+successful admission the resource set will be stored in the CCI Store and made
+available to the cpu manager policy. In the case of failure the error is reported
+back to the user and the Pod allocation is cancelled.
+
+The `CCIAddContainerResource` function is called before container start with a given
+Pod name and container name, the cgroup of the Pod, and the container id. The driver then
+performs an allocation of the admitted resource-set. In case of a failure of the
+driver an error is returned.
+
+The `CCIRemoveContainerResource` function is called to free the cpu resources taken
+by a container. The function returns nil if the operation was successful or an
+error if the operation failed on the plugin side.
+
+    /*
+    CCIDriver is a grpc service interface for CCI resource kubelet drivers.
+    The interface provides admission, allocation and cleanup entrypoints.
+    Drivers are registered as kubelet plugins and will be called by the
+    CCI resource manager for pods which are associated with this driver.
+    */
+    service CCIDriver {
+      //cciAdmit admission function to reserve resources for a given container
+      rpc CCIAdmit(AdmitRequest) returns (AdmitResponse) {}
+
+      //cciAddContainerResource allocation entry point for admitted containers
+      rpc CCIAddContainerResource (AddResourceRequest)
+        returns (AddResourceResponse) {}
+
+      //cciRemoveContainerResource cleans up allocated resources for a given container
+      rpc CCIRemoveContainerResource (RemoveResourceRequest)
+        returns (RemoveResourceResponse) {}
+    }
+
+    message AdmitRequest{
+      // currently available cpus
+      string availablecpus = 1;
+      // pod identifier
+      string pod = 2;
+      // container identifier
+      string container = 3;
+      // cci spec for the container, subset of podspec
+      string cci_spec = 4;
+    }
+
+    message ResourceSet{
+      // admitted cpuset
+      string cpuset = 1;
+      // flag if exclusive
+      bool exclusive = 2;
+    }
+
+    message AdmitResponse {
+      // allocated resource set
+      ResourceSet rset = 1;
+      // error object
+      string err = 2;
+    }
+
+    message AddResourceRequest{
+      // pod cgroup
+      string cgroup = 1;
+      // pod identifier
+      string pod = 2;
+      // container identifier
+      string container = 3;
+      // container id
+      string container_id = 4;
+    }
+
+    message AddResourceResponse {
+      // error object
+      string err = 1;
+    }
+
+    message RemoveResourceRequest {
+      // id of the container to be removed
+      string container_id = 1;
+    }
+
+    message RemoveResourceResponse {
+      // error object
+      string err = 1;
+    }
+
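+For illustration only, the following Go sketch shows the kind of driver-side policy that
+could sit behind the three RPCs above: a toy driver that hands out exclusive cpusets from
+a free pool on admit and returns them on removal. It uses plain local types rather than
+generated gRPC stubs, and all names are invented for this sketch; a real driver would add
+gRPC serving, checkpointing, and richer error handling.
+
+```go
+// Toy CCI driver policy sketch (assumed names; not generated from the proto above).
+package main
+
+import (
+	"fmt"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+)
+
+type toyDriver struct {
+	mu       sync.Mutex
+	freeCPUs map[int]bool     // CPUs not reserved by any container
+	reserved map[string][]int // container ID -> reserved CPUs
+}
+
+// admit mirrors CCIAdmit: reserve `count` exclusive CPUs for a container.
+func (d *toyDriver) admit(containerID string, count int) (string, error) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	var picked []int
+	for cpu := range d.freeCPUs {
+		if len(picked) == count {
+			break
+		}
+		picked = append(picked, cpu)
+	}
+	if len(picked) < count {
+		return "", fmt.Errorf("only %d CPUs free, need %d", len(picked), count)
+	}
+	for _, cpu := range picked {
+		delete(d.freeCPUs, cpu)
+	}
+	d.reserved[containerID] = picked
+	sort.Ints(picked)
+	parts := make([]string, len(picked))
+	for i, cpu := range picked {
+		parts[i] = strconv.Itoa(cpu)
+	}
+	return strings.Join(parts, ","), nil // cpuset string such as "2,3"
+}
+
+// remove mirrors CCIRemoveContainerResource: return the container's CPUs to the pool.
+func (d *toyDriver) remove(containerID string) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	for _, cpu := range d.reserved[containerID] {
+		d.freeCPUs[cpu] = true
+	}
+	delete(d.reserved, containerID)
+}
+
+func main() {
+	d := &toyDriver{
+		freeCPUs: map[int]bool{0: true, 1: true, 2: true, 3: true},
+		reserved: map[string][]int{},
+	}
+	cpuset, err := d.admit("container-a", 2)
+	fmt.Println(cpuset, err) // e.g. "0,1" <nil> (map iteration order varies)
+	d.remove("container-a")
+}
+```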
+5. CCI Drivers Factory API
+The KEP includes a new staged API which enables the CCI driver creation. The
+API can be used by a driver implementor to start a driver and automatically
+register it against the Kubelet's CCI Resource Manager.
+
+6. CCI Drivers Registration
+The CCI driver registration can be handled by the kubelet plugin framework.
+This approach is already used with device plugins and DRA plugins. The approach
+will be sufficient to cover plugin registration, status, and health-check functionality.
+
+
+##### Allocation & Container Removal Flow
+
+Admission and adding a Container:
+![image](CCIAllocSeqAlpha.jpg)
+Figure 2: Sequence of container allocation using a CCI driver

+
+
+The lifetime events are triggered by the container manager and internal lifecycle manager in
+the exact order for admitting, adding, and removing containers. As shown in Figure 2, admit
+is invoked for containers inside pods requiring a resource driver.
+On success, the resource set result is added to the CCI store. If the operation
+fails, an error will be reported back to the user (in alpha we will go back to best-effort QoS).
+All blocking RPC calls are configured in alpha with a reasonable timeout. After getting an actual container id, the internal
+lifecycle loop triggers the add container call.
+
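+To illustrate the timeout handling mentioned above, a blocking driver call could be bounded
+with a context deadline roughly as in the Go sketch below; the function names and the 500ms
+value are assumptions for this example, not the proposed implementation.
+
+```go
+// Sketch: bounding a blocking CCI driver call with a context timeout.
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+)
+
+// admitFunc stands in for the driver's CCIAdmit RPC.
+type admitFunc func(ctx context.Context, podUID, container string) (cpuset string, err error)
+
+// admitWithTimeout gives the driver a bounded window to answer; if it does not
+// respond in time, the manager reports an admission error instead of blocking.
+func admitWithTimeout(call admitFunc, podUID, container string, timeout time.Duration) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	cpuset, err := call(ctx, podUID, container)
+	if errors.Is(err, context.DeadlineExceeded) {
+		return "", fmt.Errorf("CCI driver did not answer admit for %s/%s within %s", podUID, container, timeout)
+	}
+	return cpuset, err
+}
+
+func main() {
+	slowDriver := func(ctx context.Context, pod, ctr string) (string, error) {
+		select {
+		case <-time.After(2 * time.Second): // simulate a hung driver
+			return "0-1", nil
+		case <-ctx.Done():
+			return "", ctx.Err()
+		}
+	}
+	_, err := admitWithTimeout(slowDriver, "pod-123", "main", 500*time.Millisecond)
+	fmt.Println(err) // admission error after ~500ms instead of hanging indefinitely
+}
+```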
+Container Removal:
+![image](CCIRemoveSeqAlpha.jpg)
+ +Figure 3: Container removal sequence diagram involving CCI plugins

+The container removal case is described as a sequence in Figure 3. After registering
+a removal event in the internal container lifecycle, the CCI manager is triggered
+and invokes the CCI Driver to free any resources taken by the container. On
+success, the CCI store will also be cleaned and a new available resource set will
+be updated. All blocking RPC calls are configured with
+a reasonable timeout in alpha.
+
+##### Post-Alpha & GA Architectural Considerations
+
+After the initial alpha implementation we will consider the integration of the
+Container Compute Interface "CCI" with the existing CPU and Memory manager stack.
+This can be achieved either by using the already existing managers and introducing state
+synchronization or through a common code base which can be invoked
+from within the CCI Manager.
+
+A possible integration with the topology manager could follow by implementing the topology hints
+interface for the CCI Manager, which will use data extracted from the CCI Store. This will
+require such data to be provided by CCI Drivers, and the CCI store has to be extended to
+be able to handle it correctly for serving the corresponding topology hints calls.
+
+
+### Test Plan
+
+
+
+[X] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+
+##### Unit tests
+
+
+
+
+
+###### Alpha
+
+* CCI Resource Manager: target code cvg >=80%
+* CCI Store: target code cvg >=80%
+* CCI CPU Manager Policy Component: target code cvg >=80%
+* CCI Drivers Factory API: target code cvg >=80%
+
+###### BETA
+
+* CPU Manager None and Static Policy Integration with CCI
+* Pod Admission Race tests
+* Introduce fail-safety tests
+* Performance/Scalability tests
+
+
+##### Integration tests
+###### Alpha
+* CPU and CCI Manager Integration test: driver-based allocation and best-effort QoS
+* State Consistency (CPU Manager + CCI) integration test
+###### BETA
+* CPU Manager None, Static Policy Integration with CCI
+* CPU, Memory, Topology and CCI Manager Integration test
+* Further integration tests with Device Manager and DRA
+* Integration test including static QoS and driver-based allocation
+
+
+##### e2e tests
+###### Alpha
+* E2E tests including a test CCI Driver
+* E2E tests for showcasing resource allocations for cpu and memory through CCI
+
+###### BETA
+* End-to-End tests to cover all CCI resource allocation use-cases
+* End-to-End tests with device plugins and DRA
+* Performance and resource utilization tests
+
+
+
+
+### Graduation Criteria
+
+
+#### Alpha to Beta
+
+- Feature implemented behind a feature flag
+- Initial e2e tests completed and enabled
+- Integrate API feedback from users and community
+- Proven cross-components consistency (ideally via tests)
+- Handling of topology manager and memory manager use-cases
+- Finish identified code refactoring of common building blocks (look at common pieces in all plugin frameworks in kubelet)
+- Look at what makes sense to leave inside the Kubelet due to latency and use case requirements.
Introduce community CCI drivers repo +Similar to Kubernetes Scheduler Plugin repo +- Have plugins specific to common use cases or environments +- Smooth integration with scheduler extensions/ plugins … + +#### Beta to GA + +- Gather feedback from developers and surveys and integrate it +- Successful adoption across users +- Proven Components Correctness +- Performance tests completion +- N examples of real-world usage + +#### Deprecation + + +### Upgrade / Downgrade Strategy + + + +### Version Skew Strategy + + + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +#### Operational Requirements for Alpha: + +* Enable CCI Feature Gate +* Disable Memory manager +* Disable Topology Manager +* Set CPU Manager policy to None + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [x] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: CCIResourceManager + - Components depending on the feature gate: CCIResourceManager, CCI CPU Manger Policy, Pod Association Mechaninism, CCI APIs +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? (Do not assume `Dynamic Kubelet Config` feature is enabled). + +###### Does enabling the feature change any default behavior? +No + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? +Yes, a deletion of pods requiring cci driver will be recommended + + +###### What happens if we reenable the feature if it was previously rolled back? +Pods requiring cci driver might have been handled by standard cpu manager, those will need to +be recreated after enabling the cci driver. + +###### Are there any tests for feature enablement/disablement? +Planned to include some + + +### Rollout, Upgrade and Rollback Planning + + +The usual Kubernetes upgrade and downgrade strategy applies for in-tree components. +Vendors must take care that upgrades and downgrades work with the drivers that +they provide to customers + +We propose a solution of the bootstrapping problem for the cluster initialization by +using the new CCI policy of CPU Manager. The policies allows the deployment of +Pods not bound a CCI driver without requiring any plugin. The allocation of such +Pods will be handled by std cpu manager. + +Pods which require CCI Driver will fail starting and report an error due to driver +unavailability. If a CCI Plugin fails, Pods which were about to be allocated +that were tied to the plugin will fail. Based on requirements a fallback mechanism +can be implemented where such Pods fallback to a given std. QoS Model. + + +###### How can a rollout or rollback fail? Can it impact already running workloads? +Failure of CCI drivers will have impact only over pods requiring CCI driver for +resource management. All other pods will not be impacted. THe impacted pods will +an error message showing failure of the driver. + + +###### What specific metrics should inform a rollback? + +Unhealthy CCI drivers which can't get to a ready state and repeating failures +to allocate pods in cases of free resources. + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + +TBD in Beta + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? 
+ +TBD in Beta + + +### Monitoring Requirements + +TBD in Beta + + +###### How can an operator determine if the feature is in use by workloads? +Pods will be running on the cluster which are associated to CCI drivers. + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [x] API .status + - Condition name: CCI Driver readiness and CCI Status field can be introduced + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? +Pods not using CCI Driveres continue to work as before. +Consistent state handling for Pods requiring CCI Driver and standard CPU-Manager managed pods. + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + +TBD in Beta +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies +None + + +###### Does this feature depend on any specific services running in the cluster? +A CCI Driver (daemonset) will be required to handle pods which need CCI driver resource management. + + +### Scalability + +The CCI Resource Manager approach resembles device manager protocol. The scalability and performance impact will be similar to the case of handling device plugins. + +Further performance benchmarks will be done in Beta Phase + + +###### Will enabling / using this feature result in any new API calls? + +For Pods requiring CCI Driver the admission, container add and container removal operations will be handled via +api call to external CCI Driver. Remaining pods should be handled as before. + + +###### Will enabling / using this feature result in introducing new API types? + +A helper API to create CCI Drivers (CCI Driver Factory) can be integrated as staged repo. + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + +No + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + +Optional CCI driver association mechanism bound to pod spec. One realistion can be an optional driver name attribute. + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? +No + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? +No + + +### Troubleshooting + + + +We propose a solution of the bootstrapping problem for the cluster initialization +by using the new CCI policy of CPU Manager. The policies allows the deployment of +Pods not bound a CCI driver without requiring any plugin. The allocation of such Pods +will be handled by std cpu manager. Pods which require CCI Driver will fail starting +and report an error due to driver unavailability. + +If a CCI Plugin fails, Pods which were about to be allocated will fail. Based on +requirements a fallback mechanism can be implemented where such Pods fallback to +a given std. QoS Model. + +![image](CCITroubleshooting.jpg)
+
+Figure 4: Troubleshooting Flows: Handling of pods requiring a CCI driver and standard pods without CCI Driver requirements in the case of an unhealthy driver.

+ +To illustrate this lets consider some of the flows described on Figure 4. We distriguish +between pods handled by CCI drivers(group 1 pods) and standard Pods(group 2 pods). Until the CCI driver is not started the pod admission of group 1 pods will fail with an admission error due to driver unavailabilty. Group 2 pods are not impaceted by and they +can be deployed on the cluster. After the CCI Driver comes online the addmission of +group 1 pods can continue. The admission transactions of group 1 and 2 are mutually exclusive. In later time the driver becomes unhealthy and the admission of second pod +from group 1 will fail. + +###### How does this feature react if the API server and/or etcd is unavailable? + +Feature lives completly in kubelet and will not be impacted directly. + +###### What are other known failure modes? +TBD + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +We plan to provide sufficient logging information inside kubelet log to help determine +problems with CCI Resource Management Stack. Additionally CCI Driver implementations should +consider integrating proper logging mechanism and error messages to help determine any +issues during operation. + +## Challenges and Considerations +###### Checkpoint assignments +The driver must be able to pick up where it left off in case the Kubelet restarts for any reason. With regards to cpusets, the CPU Manager can reassign those values in the event of a Kubelet restart as the CPU Manager state also keeps track of Pods covered by CCI drivers. Information specific to the driver can be handled by the driver itself using a checkpoint file. + +## Implementation History + +TBD + + +## Drawbacks + + + +## Alternatives + + + We could choose, instead, to continually extend existing Kubelet managers. This is + already complicated and becomes more so as users want more and more specialty use + cases. Additionally, chip companies are coming up with increasingly complicated + architecture and the community will not want to spend time and resources supporting + odd changes particular for particular chipsets. Rather, we should choose to reduce + complexity within the Kubelet over time. + +## Infrastructure Needed + + + +We may choose to add in a repo that allows donations of plugins specific to particular +use cases, in the way that we already do so for Kubernetes Scheduler Plugins. This +will allow a central place for the community to donate useful content. diff --git a/keps/sig-node/3675-resource-plugin-manager/kep.yaml b/keps/sig-node/3675-resource-plugin-manager/kep.yaml new file mode 100644 index 000000000000..e52a8180ec7e --- /dev/null +++ b/keps/sig-node/3675-resource-plugin-manager/kep.yaml @@ -0,0 +1,42 @@ +title: Pluggable Resource Management +kep-number: 3675 +authors: + - "@obiTrinobiIntel" + - "@catblade" +owning-sig: sig-node +participating-sigs: + - sig-node +status: implementable +creation-date: "2023-02-06" +reviewers: + - "@swatisehgal" + - "@klueska" +approvers: + - "@sig-node-leads" + +see-also: [] + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.27" + +# The milestone at which this feature was, or is targeted to be, at each stage. 
+milestone: + alpha: "v1.27" + beta: "v1.30" + stable: "v1.33" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: CCIManager + components: + - kubelet +disable-supported: true + +# The following PRR answers are required at beta release +metrics: [] \ No newline at end of file diff --git a/keps/sig-release/3720-freezing-k8s-gcr-io/README.md b/keps/sig-release/3720-freezing-k8s-gcr-io/README.md new file mode 100644 index 000000000000..4c9eec06dea2 --- /dev/null +++ b/keps/sig-release/3720-freezing-k8s-gcr-io/README.md @@ -0,0 +1,172 @@ +# KEP-3720: Freeze `k8s.gcr.io` image registry + +The change proposed by this KEP is very unusual as the engineering work will **not** be done in the k/k repository. However, this is a major change to the project hence the KEP. + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Test Plan](#test-plan) + - [Graduation Criteria](#graduation-criteria) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Communication Plan](#communication-plan) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. + +This KEP is unusual and doesn't actually make/propose changes to the Kubernetes codebase. It does propose a major change to how images of the Kubernetes are consumed hence the KEP. 
+ +- [X] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [X] (R) KEP approvers have approved the KEP status as `implementable` +- [X] (R) Design details are appropriately documented +- [X] (N/A) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [X] (N/A) e2e Tests for all Beta API Operations (endpoints) + - [X] (N/A) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [X] (N/A) Minimum Two Week Window for GA e2e tests to prove flake free +- [X] (N/A) Graduation criteria is in place + - [X] (N/A) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [X] (N/A) Production readiness review completed +- [X] (N/A) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +The Kubernetes project has created and runs the `registry.k8s.io` image registry which is fully controlled by the community. +This registry has been [GA for several months](https://kubernetes.io/blog/2022/11/28/registry-k8s-io-faster-cheaper-ga/) now and we need to freeze the old image registry. + +## Motivation + +Running public image registries is very expensive and eats up a significant chunk of the project's Infrastructure budget. We built `registry.k8s.io` image registry to serve images from various origins around the world depending on the location of the user. For example, an a kops Kubernetes cluster in eu-west-1 can pull images from an AWS S3 bucket in the same region which is very fast and more importantly very cheap for the Kubernetes project. + +There was a plan to redirect `k8s.gcr.io` to `registry.k8s.io` but it didn't [work out](https://kubernetes.slack.com/archives/CCK68P2Q2/p1666725317568709) so we backported the image registry defaults to 1.24, 1.23, and 1.22 so all the patch releases from December 2022 will using the new registry by default. + +We are currently exceeding our budget as it will take quite a while for end users to upgrade Kubernetes to v1.25 so we want to incentivise our end users to move to the new registry as fast as possible by freezing the registry by 1.27. This would mean that all subsequent image releases will not be available on the old registry. + +### Goals + +Freeze `k8s.gcr.io` image registry and push all new images and tags exclusively to the `registry.k8s.io` image registry. + +### Non-Goals + +- `registry.k8s.io` internal implementations details. That is handled separately by sig-k8s-infra. + +## Proposal + +Freeze the `k8s.gcr.io` image by not pushing any new digests or tags after 1.27 release. The 1.27 release itself won't be available on `k8s.gcr.io`. 
+ +I'm proposing that on the 1st of April 2023 (10 days before 1.27 is released): + +- `k8s.gcr.io` is frozen and no new images will be published by any subproject. +- last 1.23 release on `k8s.gcr.io` will be 1.23.18 (goes EoL before the freeze) +- last 1.24 release on `k8s.gcr.io` will be 1.24.12 +- last 1.25 release on `k8s.gcr.io` will be 1.25.8 +- last 1.26 release on `k8s.gcr.io` will be 1.26.3 +- 1.27.0 will not be available `k8s.gcr.io` + +### Risks and Mitigations + +There are no risks. The old registry will still be available and you can pull the images before 1.27 on there. This change will also +affect other users of k8s.gcr.io who should have already updated their helm charts and manifests to use the new registry. + +## Design Details + +The image promotion process is described [here](https://github.com/kubernetes/k8s.io/tree/main/k8s.gcr.io). Please read it for full details. + +This is the planned technical changes(grouped by repos): + +- k-sigs/promo-tools + - merge https://github.com/kubernetes-sigs/promo-tools/pull/669 +- k/k8s.io + - Fix https://github.com/kubernetes/k8s.io/issues/4611 + - clean up the contents of the `registry.k8s.io` folder. Most of the content should be in k/registry.k8s.io repository + - duplicate the top level folder `k8s.gcr.io` in the repo and call it `registry.k8s.io` +- k/test-infra + - blockade the k8s.gcr.io folder in k/k8s.io repository. blockade is a prow plugin that rejects PRs that modify specific folders/files. + - update the ProwJobs [post-k8sio-image-promo](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/sig-k8s-infra/trusted/releng/releng-trusted.yaml) and [pull-k8sio-cip](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/sig-k8s-infra/releng/artifact-promotion-presubmits.yaml) `thin-manifest-dir` flags to point to the new folder + + +### Test Plan + +This is not applicable. + +### Graduation Criteria + +This is not applicable. + +### Upgrade / Downgrade Strategy + +When users upgrade to various kubernetes versions that use the new image registry, they will be able to pull images from the new +registry. + +### Version Skew Strategy + +This is not applicable. + +## Communication Plan + +This is a major change and requires an appropriate communication plan. + +We plan on communicating this change via: +- an email to k-dev +- an email to k-announce +- a blog post on kubernetes.io + +## Production Readiness Review Questionnaire + +This is not applicable. + +## Implementation History + +## Drawbacks + +This is not applicable. + + + +## Alternatives + +We keep pushing new images to the old registry. + +## Infrastructure Needed (Optional) + +None as it has already been deployed. 
diff --git a/keps/sig-release/3720-freezing-k8s-gcr-io/kep.yaml b/keps/sig-release/3720-freezing-k8s-gcr-io/kep.yaml new file mode 100644 index 000000000000..e6aca27d914b --- /dev/null +++ b/keps/sig-release/3720-freezing-k8s-gcr-io/kep.yaml @@ -0,0 +1,29 @@ +title: Freezing `k8s.gcr.io` image registry +kep-number: 3720 +authors: + - "@upodroid" +owning-sig: sig-release +participating-sigs: + - sig-k8s-infra + - sig-release +status: implementable +creation-date: 2023-01-10 +reviewers: + - "@ameukam" + - "@cpanato" + - "@dims" + - "@jeremyrickard" + - "@justaugustus" + - "@saschagrunert" +approvers: + - "@ameukam" + - "@cpanato" + - "@dims" + - "@jeremyrickard" + - "@justaugustus" + - "@saschagrunert" +stage: stable +latest-milestone: "v1.27" +milestone: + stable: "v1.27" +disable-supported: true diff --git a/keps/sig-scheduling/2926-job-mutable-scheduling-directives/README.md b/keps/sig-scheduling/2926-job-mutable-scheduling-directives/README.md index 64ce405d7f60..b7b85c4be876 100644 --- a/keps/sig-scheduling/2926-job-mutable-scheduling-directives/README.md +++ b/keps/sig-scheduling/2926-job-mutable-scheduling-directives/README.md @@ -89,6 +89,9 @@ tags, and then generate with `hack/update-toc.sh`. - [Risks and Mitigations](#risks-and-mitigations) - [Design Details](#design-details) - [Test Plan](#test-plan) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) - [Graduation Criteria](#graduation-criteria) - [Beta](#beta) - [GA](#ga) @@ -162,8 +165,8 @@ specific partition (e.g., preemptibles) or a specific location (e.g., zone x). This is a proposal to relax update validation on jobs that have never been unsuspended to allow mutating node scheduling directives, namely the pod template's node affinity, -node selector, tolerations, annotations and labels of the job's pod template. This enables -a higher-level queue controller to inject such directives before un-suspending a job to +node selector, tolerations, schedulingGates, annotations and labels of the job's pod template. This +enables a higher-level queue controller to inject such directives before un-suspending a job to influence its placement. -No, unit and integration tests will be added. +No. The unit tests that are exercising the `switch` of feature gate itself will be added. ### Rollout, Upgrade and Rollback Planning @@ -659,6 +659,12 @@ feature flags will be enabled on some API servers and not others during the rollout. Similarly, consider large clusters and how enablement/disablement will rollout across nodes. --> +It won't impact already running workloads because it is an opt-in feature in scheduler. +But during a rolling upgrade, if some apiservers have not enabled the feature, they will not +be able to accept and store the field "MatchLabelKeys" and the pods associated with these +apiservers will not be able to use this feature. As a result, pods belonging to the +same deployment may have different scheduling outcomes. + ###### What specific metrics should inform a rollback? @@ -666,6 +672,9 @@ will rollout across nodes. What signals should users be paying attention to when the feature is young that might indicate a serious problem? --> +- If the metric `schedule_attempts_total{result="error|unschedulable"}` increased significantly after pods using this feature are added. +- If the metric `plugin_execution_duration_seconds{plugin="PodTopologySpread"}` increased to higher than 100ms on 90% after pods using this feature are added. + ###### Were upgrade and rollback tested? 
Was the upgrade->downgrade->upgrade path tested? @@ -674,12 +683,60 @@ Describe manual testing that was done and the outcomes. Longer term, we may want to require automated upgrade/rollback tests, but we are missing a bunch of machinery and tooling and can't do that now. --> +Yes, it was tested manually by following the steps below, and it was working at intended. +1. create a kubernetes cluster v1.26 with 3 nodes where `MatchLabelKeysInPodTopologySpread` feature is disabled. +2. deploy a deployment with this yaml +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nginx +spec: + replicas: 12 + selector: + matchLabels: + foo: bar + template: + metadata: + labels: + foo: bar + spec: + restartPolicy: Always + containers: + - name: nginx + image: nginx:1.14.2 + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + foo: bar + matchLabelKeys: + - pod-template-hash +``` +3. pods spread across nodes as 4/4/4 +4. update the deployment nginx image to `nginx:1.15.0` +5. pods spread across nodes as 5/4/3 +6. delete deployment nginx +7. upgrade kubenetes cluster to v1.27 (at master branch) while `MatchLabelKeysInPodTopologySpread` is enabled. +8. deploy a deployment nginx like step2 +9. pods spread across nodes as 4/4/4 +10. update the deployment nginx image to `nginx:1.15.0` +11. pods spread across nodes as 4/4/4 +12. delete deployment nginx +13. downgrade kubenetes cluster to v1.26 where `MatchLabelKeysInPodTopologySpread` feature is enabled. +14. deploy a deployment nginx like step2 +15. pods spread across nodes as 4/4/4 +16. update the deployment nginx image to `nginx:1.15.0` +17. pods spread across nodes as 4/4/4 ###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? +No. ### Monitoring Requirements @@ -694,6 +751,7 @@ Ideally, this should be a metric. Operations against the Kubernetes API (e.g., checking if there are objects with field X set) may be a last resort. Avoid logs or events for this purpose. --> +Operator can query pods that have the `pod.spec.topologySpreadConstraints.matchLabelKeys` field set to determine if the feature is in use by workloads. ###### How can someone using this feature know that it is working for their instance? @@ -706,13 +764,8 @@ and operation of this feature. Recall that end users cannot usually observe component logs or access metrics. --> -- [ ] Events - - Event Reason: -- [ ] API .status - - Condition name: - - Other field: -- [ ] Other (treat as last resort) - - Details: +- [x] Other (treat as last resort) + - Details: We can determine if this feature is being used by checking deployments that have only `MatchLabelKeys` set in `TopologySpreadConstraint` and no `LabelSelector`. These Deployments will strictly adhere to TopologySpread after both deployment and rolling upgrades if the feature is being used. ###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? @@ -730,6 +783,7 @@ high level (needs more precise definitions) those may be things like: These goals will help you determine what you need to measure (SLIs) in the next question. --> +Metric plugin_execution_duration_seconds{plugin="PodTopologySpread"} <= 100ms on 90-percentile. ###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? @@ -737,12 +791,10 @@ question. Pick one more of these and delete the rest. 
--> -- [ ] Metrics - - Metric name: - - [Optional] Aggregation method: - - Components exposing the metric: -- [ ] Other (treat as last resort) - - Details: +- [x] Metrics + - Component exposing the metric: kube-scheduler + - Metric name: `plugin_execution_duration_seconds{plugin="PodTopologySpread"}` + - Metric name: `schedule_attempts_total{result="error|unschedulable"}` ###### Are there any missing metrics that would be useful to have to improve observability of this feature? @@ -750,6 +802,8 @@ Pick one more of these and delete the rest. Describe the metrics themselves and the reasons why they weren't added (e.g., cost, implementation difficulties, etc.). --> +Yes. It's helpful if we have the metrics to see which plugins affect to scheduler's decisions in Filter/Score phase. +There is the related issue: https://github.com/kubernetes/kubernetes/issues/110643 . It's very big and still on the way. ### Dependencies @@ -773,6 +827,7 @@ and creating new ones, as well as about cluster-level services (e.g. DNS): - Impact of its outage on the feature: - Impact of its degraded performance or high-error rates on the feature: --> +No. ### Scalability @@ -800,6 +855,7 @@ Focusing mostly on: - periodic API calls to reconcile state (e.g. periodic fetching state, heartbeats, leader election, etc.) --> +No. ###### Will enabling / using this feature result in introducing new API types? @@ -809,6 +865,7 @@ Describe them, providing: - Supported number of objects per cluster - Supported number of objects per namespace (for namespace-scoped objects) --> +No. ###### Will enabling / using this feature result in any new calls to the cloud provider? @@ -817,6 +874,7 @@ Describe them, providing: - Which API(s): - Estimated increase: --> +No. ###### Will enabling / using this feature result in increasing size or count of the existing API objects? @@ -826,6 +884,7 @@ Describe them, providing: - Estimated increase in size: (e.g., new annotation of size 32B) - Estimated amount of new objects: (e.g., new Object X for every existing Pod) --> +No. ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? @@ -837,6 +896,8 @@ Think about adding additional work or introducing new steps in between [existing SLIs/SLOs]: https://git.k8s.io/community/sig-scalability/slos/slos.md#kubernetes-slisslos --> +Yes. there is an additional work: the scheduler will use the keys in `matchLabelKeys` to look up label values from the pod and AND with `LabelSelector`. +Maybe result in a very samll impact in scheduling latency which directly contributes to pod-startup-latency SLO. ###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? @@ -849,6 +910,7 @@ This through this both in small and large cases, again with respect to the [supported limits]: https://git.k8s.io/community//sig-scalability/configs-and-limits/thresholds.md --> +No. ### Troubleshooting @@ -861,6 +923,8 @@ details). For now, we leave it here. --> ###### How does this feature react if the API server and/or etcd is unavailable? +If the API server and/or etcd is not available, this feature will not be available. +This is because the scheduler needs to update the scheduling results to the pod via the API server/etcd. ###### What are other known failure modes? @@ -876,8 +940,18 @@ For each of them, fill in the following information by copying the below templat Not required until feature graduated to beta. 
- Testing: Are there any tests for failure mode? If not, describe why. --> +N/A ###### What steps should be taken if SLOs are not being met to determine the problem? +- Check the metric `plugin_execution_duration_seconds{plugin="PodTopologySpread"}` to determine + if the latency increased. If increased, it means this feature may increased scheduling latency. + You can disable the feature `MatchLabelKeysInPodTopologySpread` to see if it's the cause of the + increased latency. +- Check the metric `schedule_attempts_total{result="error|unschedulable"}` to determine if the number + of attempts increased. If increased, You need to determine the cause of the failure by the event of + the pod. If it's caused by plugin `PodTopologySpread`, You can further analyze this problem by looking + at the scheduler log. + ## Implementation History @@ -892,6 +966,8 @@ Major milestones might include: - when the KEP was retired or superseded --> - 2022-03-17: Initial KEP + - 2022-06-08: KEP merged + - 2023-01-16: Graduate to Beta ## Drawbacks diff --git a/keps/sig-scheduling/3243-respect-pod-topology-spread-after-rolling-upgrades/kep.yaml b/keps/sig-scheduling/3243-respect-pod-topology-spread-after-rolling-upgrades/kep.yaml index 41f25fbd9344..4ec316524333 100644 --- a/keps/sig-scheduling/3243-respect-pod-topology-spread-after-rolling-upgrades/kep.yaml +++ b/keps/sig-scheduling/3243-respect-pod-topology-spread-after-rolling-upgrades/kep.yaml @@ -3,7 +3,7 @@ kep-number: 3243 authors: - "@denkensk" owning-sig: sig-scheduling -status: provisional +status: implementable creation-date: 2022-03-17 reviewers: - "@ahg-g" @@ -17,18 +17,18 @@ see-also: - "/keps/sig-scheduling/3094-pod-topology-spread-considering-taints" # The target maturity stage in the current dev cycle for this KEP. -stage: alpha +stage: beta # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.25" +latest-milestone: "v1.27" # The milestone at which this feature was, or is targeted to be, at each stage. milestone: alpha: "v1.25" - beta: "v1.26" - stable: "v1.28" + beta: "v1.27" + stable: "v1.29" # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled @@ -39,3 +39,8 @@ feature-gates: - kube-scheduler disable-supported: true + +# The following PRR answers are required at beta release +metrics: +- plugin_execution_duration_seconds{plugin="PodTopologySpread"} +- schedule_attempts_total{result="error|unschedulable"} diff --git a/keps/sig-scheduling/3521-pod-scheduling-readiness/README.md b/keps/sig-scheduling/3521-pod-scheduling-readiness/README.md index de1076b506ff..120f6eb6a41c 100644 --- a/keps/sig-scheduling/3521-pod-scheduling-readiness/README.md +++ b/keps/sig-scheduling/3521-pod-scheduling-readiness/README.md @@ -76,6 +76,7 @@ tags, and then generate with `hack/update-toc.sh`. 
- [Motivation](#motivation) - [Goals](#goals) - [Non-Goals](#non-goals) + - [Future Work](#future-work) - [Proposal](#proposal) - [User Stories (Optional)](#user-stories-optional) - [Story 1](#story-1) @@ -140,9 +141,9 @@ Items marked with (R) are required *prior to targeting to a milestone / release* - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) - [ ] (R) Production readiness review completed - [ ] (R) Production readiness review approved -- [ ] "Implementation History" section is up-to-date for milestone -- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] -- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes +- [x] "Implementation History" section is up-to-date for milestone +- [x] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [x] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes -Create a test with the following sequences: +An e2e test was created in Alpha with the following sequences: -- Provision a cluster with feature gate `PodSchedulingReadiness=true` (we may need to setup a testgrid -for when it's alpha) - Create a Pod with non-nil `.spec.schedulingGates`. - Wait for 15 seconds to ensure (and then verify) it did not get scheduled. - Clear the Pod's `.spec.schedulingGates` field. - Wait for 5 seconds for the Pod to be scheduled; otherwise error the e2e test. +In beta, it will be enabled by default in the [k8s-triage](https://storage.googleapis.com/k8s-triage/index.html?sig=scheduling&test=Feature%3APodSchedulingReadiness) dashboard. + ### Graduation Criteria -Appropriate tests will be added in Alpha. See [Test Plan](#test-plan) for more details. +Appropriate tests have been added in the integration tests. See [Integration tests](#integration-tests) for more details. ### Rollout, Upgrade and Rollback Planning @@ -790,6 +799,12 @@ rollout. Similarly, consider large clusters and how enablement/disablement will rollout across nodes. --> +It shouldn't impact already running workloads. It's an opt-in feature, and users need to set +`.spec.schedulingGates` field to use this feature. + +When this feature is disabled by the feature flag, the already created Pod's `.spec.schedulingGates` +field is preserved, however, the newly created Pod's `.spec.schedulingGates` field is silently dropped. + ###### What specific metrics should inform a rollback? +A rollback might be considered if the metric `scheduler_pending_pods{queue="gated"}` stays in a +high watermark for a long time since it may indicate that some controllers are not properly handling +removing the scheduling gates, which causes the pods to stay in pending state. + +Another indicator for rollback is the 90-percentile value of metric `scheduler_plugin_execution_duration_seconds{plugin="SchedulingGates"}` +exceeds 100ms steadily. + ###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? +It will be tested manually prior to beta launch. + +<> +Add detailed scenarios and result here, and cc @wojtek-t. +<> + ###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? 
+No. + ### Monitoring Requirements -Add a label `notReady` to the existing metric `pending_pods` to distinguish unschedulable Pods: +A new "queue" type `gated` is added to the existing metric `scheduler_pending_pods` to distinguish +unschedulable Pods: -- `unschedulable` (existing): scheduler tried but cannot find any Node to host the Pod -- `notReady` (new): scheduler respect the Pod's present `schedulingGates` and hence not schedule it +- `scheduler_pending_pods{queue="unschedulable"}` (existing): scheduler tried but cannot find any +Node to host the Pod +- `scheduler_pending_pods{queue="gated"}` (new): scheduler respect the Pod's present `schedulingGates` +and hence not schedule it + +The metric `scheduler_plugin_execution_duration_seconds{plugin="SchedulingGates"}` gives a histogram +to show the Nth percentile value how SchedulingGates plugin is executed. Moreover, to explicitly indicate a Pod's scheduling-unready state, a condition -`{type:PodScheduled, reason:WaitingForGates}` is introduced. +`{type:PodScheduled, reason:SchedulingGated}` is introduced. ###### How can an operator determine if the feature is in use by workloads? @@ -836,6 +872,10 @@ checking if there are objects with field X set) may be a last resort. Avoid logs or events for this purpose. --> +- observe non-zero value for the metric `pending_pods{queue="gated"}` +- observe entries for the metric `scheduler_plugin_execution_duration_seconds{plugin="SchedulingGates"}` +- observe non-empty value in a Pod's `.spec.schedulingGates` field + ###### How can someone using this feature know that it is working for their instance? +N/A. + ###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? -- [ ] Metrics - - Metric name: - - [Optional] Aggregation method: - - Components exposing the metric: -- [ ] Other (treat as last resort) - - Details: +- [x] Metrics + - Metric name: scheduler_pending_pods{queue="gated"} + - Metric name: scheduler_plugin_execution_duration_seconds{plugin="SchedulingGates"} + - Components exposing the metric: kube-scheduler ###### Are there any missing metrics that would be useful to have to improve observability of this feature? @@ -892,12 +937,16 @@ Describe the metrics themselves and the reasons why they weren't added (e.g., co implementation difficulties, etc.). --> +N/A. + ### Dependencies +N/A. + ###### Does this feature depend on any specific services running in the cluster? +N/A. + ### Scalability +The feature itself doesn't generate API calls. But it's expected the API Server would receive +additional update/patch requests to mutate the scheduling gates, by external controllers. + ###### Will enabling / using this feature result in introducing new API types? +No. + ###### Will enabling / using this feature result in any new calls to the cloud provider? +No. + ###### Will enabling / using this feature result in increasing size or count of the existing API objects? +- No to existing API objects that doesn't use this feature. +- For API objects that use this feature: + - API type: Pod + - Estimated increase in size: new field `.spec.schedulingGates` about ~64 bytes (in the case of 2 scheduling gates) + ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? +No. + ###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? +No. 
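+
+As a concrete illustration of the scalability answers above, here is a minimal sketch of a Pod
+carrying two scheduling gates; the gate names are hypothetical and not defined by this KEP. Until
+external controllers remove both entries, the Pod stays pending with the `PodScheduled` condition
+reason `SchedulingGated` and is counted in `scheduler_pending_pods{queue="gated"}`:
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gated-pod
+spec:
+  schedulingGates:                    # each entry adds roughly "name: <string>" to the object,
+  - name: example.com/quota-check     # hence the ~64-byte estimate above for two gates
+  - name: example.com/capacity-check  # hypothetical gate names owned by external controllers
+  containers:
+  - name: pause
+    image: registry.k8s.io/pause:3.9
+```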
+
### Troubleshooting
+In a highly-available cluster, if there are skewed API Servers, some update requests
+may get accepted and some may get rejected.
+
###### What steps should be taken if SLOs are not being met to determine the problem?
+N/A.
+
## Implementation History
-- 2022-09-16 - Initial Proposal
+- 2022-09-16: Initial KEP
+- 2023-01-14: Graduate the feature to Beta
## Drawbacks
@@ -1052,11 +1131,11 @@ not need to be as detailed as the proposal, but should include enough
information to express the idea and why it was not acceptable.
-->
-- Define a boolean filed `.spec.schedulingPaused`. Its value is optionally initialized to `True` to
+Define a boolean field `.spec.schedulingPaused`. Its value is optionally initialized to `True` to
indicate it's not scheduling-ready, and flipped to `False` (by a controller) afterwards to trigger
this Pod's scheduling cycle.
- This approach is not chosen because it cannot support multiple independent controllers to control
+This approach is not chosen because it cannot support multiple independent controllers to control
specific aspect a Pod's scheduling readiness.
## Infrastructure Needed (Optional)
diff --git a/keps/sig-scheduling/3521-pod-scheduling-readiness/kep.yaml b/keps/sig-scheduling/3521-pod-scheduling-readiness/kep.yaml
index 3c865a57e01c..b864c45495c0 100644
--- a/keps/sig-scheduling/3521-pod-scheduling-readiness/kep.yaml
+++ b/keps/sig-scheduling/3521-pod-scheduling-readiness/kep.yaml
@@ -18,12 +18,12 @@ see-also:
- "/keps/sig-scheduling/624-scheduling-framework"
# The target maturity stage in the current dev cycle for this KEP.
-stage: alpha
+stage: beta
# The most recent milestone for which work toward delivery of this KEP has been
# done. This can be the current (upcoming) milestone, if it is being actively
# worked on.
-latest-milestone: "v1.26"
+latest-milestone: "v1.27"
# The milestone at which this feature was, or is targeted to be, at each stage.
milestone: @@ -42,4 +42,4 @@ disable-supported: true # The following PRR answers are required at beta release metrics: - - TBD + - scheduler_pending_pods{queue="gated"} diff --git a/keps/sig-scheduling/3633-matchlabelkeys-to-podaffinity/README.md b/keps/sig-scheduling/3633-matchlabelkeys-to-podaffinity/README.md new file mode 100644 index 000000000000..6d00d042667e --- /dev/null +++ b/keps/sig-scheduling/3633-matchlabelkeys-to-podaffinity/README.md @@ -0,0 +1,924 @@ + +# KEP-3633: Introduce MatchLabelKeys to PodAffinity and PodAntiAffinity + + + + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+
+- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
+- [ ] (R) KEP approvers have approved the KEP status as `implementable`
+- [ ] (R) Design details are appropriately documented
+- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
+  - [ ] e2e Tests for all Beta API Operations (endpoints)
+  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
+- [ ] (R) Graduation criteria is in place
+  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
+- [ ] (R) Production readiness review completed
+- [ ] (R) Production readiness review approved
+- [ ] "Implementation History" section is up-to-date for milestone
+- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
+- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes
+
+
+[kubernetes.io]: https://kubernetes.io/
+[kubernetes/enhancements]: https://git.k8s.io/enhancements
+[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
+[kubernetes/website]: https://git.k8s.io/website
+
+## Summary
+
+This KEP proposes introducing a complementary field `MatchLabelKeys` to `PodAffinityTerm`.
+This enables users to finely control the scope where Pods are expected to co-exist (PodAffinity)
+or not (PodAntiAffinity), on top of the existing `LabelSelector`.
+
+## Motivation
+
+During a workload's rolling upgrade, depending on its `upgradeStrategy`, old and new versions of
+Pods may co-exist in the cluster.
+As the scheduler cannot distinguish "old" from "new", it cannot properly honor the API semantics of
+PodAffinity and PodAntiAffinity during the upgrade.
+In the worst case, the new version of the Pod cannot be scheduled in a saturated cluster: [1], [2], [3].
+
+[1]: https://github.com/kubernetes/kubernetes/issues/56539#issuecomment-449757010
+[2]: https://github.com/kubernetes/kubernetes/issues/90151
+[3]: https://github.com/kubernetes/kubernetes/issues/103584
+
+On the other hand, on an idle cluster, this can make the scheduling result sub-optimal because
+some qualifying Nodes are filtered out incorrectly.
+
+The same issue applies to other scheduling directives as well. For example, MatchLabelKeys was introduced in `topologySpreadConstraints` in [KEP-3243: Respect PodTopologySpread after rolling upgrades](https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/3243-respect-pod-topology-spread-after-rolling-upgrades).
+
+### Goals
+
+- Introduce `MatchLabelKeys` in `PodAffinityTerm` to let users define the scope where Pods are evaluated in required and preferred Pod(Anti)Affinity.
+
+### Non-Goals
+
+- Apply additional internal labels when evaluating `MatchLabelKeys`
+
+## Proposal
+
+### User Stories (Optional)
+
+#### Story 1
+
+Users run a rolling update with a Deployment that uses required PodAffinity, and they want only
+replicas from the same ReplicaSet to be evaluated.
+
+The deployment controller adds [pod-template-hash](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#pod-template-hash-label) to the underlying ReplicaSet, and thus every Pod created from the Deployment carries the hash string.
+
+Therefore, users can use `pod-template-hash` in `MatchLabelKeys` to inform the scheduler to only evaluate Pods with the same `pod-template-hash` value.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: application-server
+...
+      affinity:
+        podAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+          - labelSelector:
+              matchExpressions:
+              - key: app
+                operator: In
+                values:
+                - database
+            topologyKey: topology.kubernetes.io/zone
+            matchLabelKeys: # ADDED
+            - pod-template-hash
+```
+
+### Notes/Constraints/Caveats (Optional)
+
+In most scenarios, users can use the label keyed with `pod-template-hash` added
+automatically by the Deployment controller to distinguish between different
+revisions in a single Deployment. But for more complex scenarios
+(e.g., Pod(Anti)Affinity associating two deployments at the same time), users are
+responsible for providing common labels to identify which pods should be grouped.
+
+### Risks and Mitigations
+
+In addition to using `pod-template-hash` added by the Deployment controller,
+users can also provide a customized key in `MatchLabelKeys` to identify
+which pods should be grouped. If so, the user needs to ensure that it is
+correct and does not collide with labels of other unrelated workloads.
+
+## Design Details
+
+A new optional field `MatchLabelKeys` is introduced to `PodAffinityTerm`.
+
+```go
+type PodAffinityTerm struct {
+	LabelSelector *metav1.LabelSelector
+	Namespaces []string
+	TopologyKey string
+	NamespaceSelector *metav1.LabelSelector
+
+	// MatchLabelKeys is a set of pod label keys to select which pods will
+	// be taken into consideration. The keys are used to lookup values from the
+	// incoming pod labels, those key-value labels are ANDed with `LabelSelector`
+	// to select the group of existing pods to be taken into consideration
+	// for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming
+	// pod labels will be ignored. The default value is empty.
+	// +optional
+	MatchLabelKeys []string
+}
+```
+
+The inter-pod affinity plugin will look up the label values from the incoming pod's
+labels using the keys in `MatchLabelKeys`. The obtained key-value labels will be merged
+into the `LabelSelector` of `PodAffinityTerm` to filter and group pods.
+The pods belonging to the same group will be evaluated.
+
+### Test Plan
+
+[x] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+
+##### Prerequisite testing updates
+
+##### Unit tests
+
+- `k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity/filtering.go`: `2022-11-09 14:43 JST` (The commit hash: `20a9f7786aa4ee0b6e1619c7974ea4562d2b2500`) - `91.7%`
+- `k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity/filtering.go`: `2022-11-09 14:43 JST` (The commit hash: `20a9f7786aa4ee0b6e1619c7974ea4562d2b2500`) - `87.3%`
+
+##### Integration tests
+
+- These tests will be added:
+  - `MatchLabelKeys` in `PodAffinity` (both in Filter and Score) works as expected.
+  - `MatchLabelKeys` in `PodAntiAffinity` (both in Filter and Score) works as expected.
+  - `MatchLabelKeys` with the feature gate enabled/disabled.
+ +**Filter** +- `k8s.io/kubernetes/test/integration/scheduler/filters/filters_test.go`: https://storage.googleapis.com/k8s-triage/index.html?test=TestPodTopologySpreadFilter + +**Score** +- `k8s.io/kubernetes/test/integration/scheduler/scoring/priorities_test.go`: https://storage.googleapis.com/k8s-triage/index.html?test=TestPodTopologySpreadScoring + +Also, we should make sure this feature brings no significant performance degradation in both Filter and Score. + +- `k8s.io/kubernetes/test/integration/scheduler_perf/scheduler_perf_test.go`: https://storage.googleapis.com/k8s-triage/index.html?test=BenchmarkPerfScheduling + +##### e2e tests + + + +- These e2e tests will be added. + - `MatchLabelKeys: ["pod-template-hash"]` in `PodAffinity` (both in Filter and Score) works as expected with a rolling upgrade scenario. + - `MatchLabelKeys: ["pod-template-hash"]` in `PodAntiAffinity` (both in Filter and Score) works as expected with a rolling upgrade scenario. + +### Graduation Criteria + + + +#### Alpha + +- Feature implemented behind a feature flag +- Unit tests and e2e tests are implemented +- No significant performance degradation is observed from the benchmark test + +#### Beta + +- The feature gate is enabled by default. + +#### GA + +- No negative feedback. +- No bug issues reported. + +### Upgrade / Downgrade Strategy + + + +**Upgrade** + +The previous PodAffinity/PodAntiAffinity behavior will not be broken. Users can continue to use +their Pod specs as it is. + +To use this enhancement, users need to enable the feature gate (during this feature is in the alpha.), +and add `MatchLabelKeys` on their PodAffinity/PodAntiAffinity. + +**Downgrade** + +kube-apiserver will ignore `MatchLabelKeys` in PodAffinity/PodAntiAffinity, +and thus, kube-scheduler will also do nothing with it. + +### Version Skew Strategy + + + +N/A + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [x] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: `MatchLabelKeysInPodAffinityAndPodAntiAffinity` + - Components depending on the feature gate: `kube-scheduler`, `kube-apiserver` +- [ ] Other + +###### Does enabling the feature change any default behavior? + + + +No. + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + + +The feature can be disabled in Alpha and Beta versions +by restarting kube-apiserver and kube-scheduler with the feature-gate off. +In terms of Stable versions, users can choose to opt-out by not setting the +`MatchLabelKeys` field. + + +###### What happens if we reenable the feature if it was previously rolled back? + +Scheduling of new Pods is affected. (Scheduled Pods aren't affected.) + +###### Are there any tests for feature enablement/disablement? + + + +No. But, the tests to confirm the behavior on switching the feature gate will be added. + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? 
+ + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + + - 2022-11-09: Initial KEP PR is submitted. + +## Drawbacks + + + +## Alternatives + + + +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-scheduling/3633-matchlabelkeys-to-podaffinity/kep.yaml b/keps/sig-scheduling/3633-matchlabelkeys-to-podaffinity/kep.yaml new file mode 100644 index 000000000000..17c2d8e713fb --- /dev/null +++ b/keps/sig-scheduling/3633-matchlabelkeys-to-podaffinity/kep.yaml @@ -0,0 +1,29 @@ +title: Introduce MatchLabelKeys to PodAffinity and PodAntiAffinity +kep-number: 3633 +authors: + - "@sanposhiho" +owning-sig: sig-scheduling +status: provisional +creation-date: 2022-11-09 +reviewers: + - "@Huang-Wei" +approvers: + - "@Huang-Wei" + +see-also: + - "/keps/sig-scheduling/3243-respect-pod-topology-spread-after-rolling-upgrades" + +stage: alpha + +latest-milestone: "v1.27" + +milestone: + alpha: "v1.27" + beta: "v1.28" + stable: "v1.30" + +feature-gates: + - name: MatchLabelKeysInPodAffinity + components: + - kube-scheduler +disable-supported: true diff --git a/keps/sig-storage/2485-read-write-once-pod-pv-access-mode/README.md b/keps/sig-storage/2485-read-write-once-pod-pv-access-mode/README.md index a327a089c87b..313655346379 100644 --- a/keps/sig-storage/2485-read-write-once-pod-pv-access-mode/README.md +++ b/keps/sig-storage/2485-read-write-once-pod-pv-access-mode/README.md @@ -94,17 +94,23 @@ tags, and then generate with `hack/update-toc.sh`. 
- [Design Details](#design-details) - [Kubernetes Changes, Access Mode](#kubernetes-changes-access-mode) - [Scheduler Enforcement](#scheduler-enforcement) + - [Alpha](#alpha) + - [Beta](#beta) - [Mount Enforcement](#mount-enforcement) - [CSI Specification Changes, Volume Capabilities](#csi-specification-changes-volume-capabilities) - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) - [Validation of PersistentVolumeSpec Object](#validation-of-persistentvolumespec-object) - [Mounting and Mapping with ReadWriteOncePod](#mounting-and-mapping-with-readwriteoncepod) - [Mounting and Mapping with ReadWriteOnce](#mounting-and-mapping-with-readwriteonce) - [Mapping Kubernetes Access Modes to CSI Volume Capability Access Modes](#mapping-kubernetes-access-modes-to-csi-volume-capability-access-modes) - [End to End Tests](#end-to-end-tests) - [Graduation Criteria](#graduation-criteria) - - [Alpha](#alpha) - - [Beta](#beta) + - [Alpha](#alpha-1) + - [Beta](#beta-1) - [GA](#ga) - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) - [Version Skew Strategy](#version-skew-strategy) @@ -112,6 +118,9 @@ tags, and then generate with `hack/update-toc.sh`. - [API Server Version N / Scheduler Version N-1 / Kubelet Version N-1 or N-2](#api-server-version-n--scheduler-version-n-1--kubelet-version-n-1-or-n-2) - [API Understands ReadWriteOncePod, CSI Sidecars Do Not](#api-understands-readwriteoncepod-csi-sidecars-do-not) - [CSI Controller Service Understands New CSI Access Modes, CSI Node Service Does Not](#csi-controller-service-understands-new-csi-access-modes-csi-node-service-does-not) + - [API Server Has Feature Enabled, Scheduler and Kubelet Do Not](#api-server-has-feature-enabled-scheduler-and-kubelet-do-not) + - [API Server Has Feature Enabled, Scheduler Does not, Kubelet Does](#api-server-has-feature-enabled-scheduler-does-not-kubelet-does) + - [API Server Has Feature Enabled, Scheduler Does, Kubelet Does Not](#api-server-has-feature-enabled-scheduler-does-kubelet-does-not) - [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) - [Feature Enablement and Rollback](#feature-enablement-and-rollback) - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) @@ -389,6 +398,8 @@ This access mode will be enforced in two places: #### Scheduler Enforcement +##### Alpha + First is at the time a pod is scheduled. When scheduling a pod, if another pod is found using the same PVC and the PVC uses ReadWriteOncePod, then scheduling will fail and the pod will be considered UnschedulableAndUnresolvable. @@ -407,6 +418,26 @@ marked UnschedulableAndUnresolvable. [volume restrictions plugin]: https://github.com/kubernetes/kubernetes/blob/v1.21.0/pkg/scheduler/framework/plugins/volumerestrictions/volume_restrictions.go#L29 [node info cache]: https://github.com/kubernetes/kubernetes/blob/v1.21.0/pkg/scheduler/framework/types.go#L357 +##### Beta + +Support for pod preemption is enforced in beta. + +When a pod (A) with a ReadWriteOncePod PVC is scheduled, if another pod (B) is +found using the same PVC and pod (A) has higher priority, the scheduler will +return an "Unschedulable" status and attempt to preempt pod (B). 
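+
+As a concrete illustration of the preemption scenario above, a minimal sketch follows; the object
+names and the `high-priority` PriorityClass are hypothetical and not part of this KEP. If a
+lower-priority pod (B) already uses the claim, scheduling `pod-a` below triggers preemption of
+pod (B):
+
+```yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: single-writer-pvc
+spec:
+  accessModes: ["ReadWriteOncePod"]   # at most one pod cluster-wide may use this claim
+  resources:
+    requests:
+      storage: 1Gi
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: pod-a                         # the higher-priority pod (A) from the text above
+spec:
+  priorityClassName: high-priority    # hypothetical PriorityClass granting higher priority
+  containers:
+  - name: app
+    image: registry.k8s.io/pause:3.9
+    volumeMounts:
+    - name: data
+      mountPath: /data
+  volumes:
+  - name: data
+    persistentVolumeClaim:
+      claimName: single-writer-pvc
+```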
+ +The implementation goes like follows: + +In the PreFilter phase of the volume restrictions scheduler plugin, we will +build a cache of the ReadWriteOncePod PVCs for the pod-to-be-scheduled and the +number of conflicting PVC references (pods already using any of these PVCs). +This cache will be saved as part of the scheduler's cycleState and forwarded to +the following step. During AddPod and RemovePod, if there is a conflict we will +add or subtract from the number of conflicting references. During the Filter +phase, if the cache contains a non-zero amount of conflicting references then +return "Unschedulable". If the pod has a PVC that cannot be found, return +"UnschedulableAndUnresolvable". + #### Mount Enforcement As an additional precaution this will also be enforced at the time a volume is @@ -487,14 +518,7 @@ external-attacher, and external-resizer. +[X] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes +necessary to implement this enhancement. + +##### Prerequisite testing updates + + + +None. New tests will be added for the transition to beta to support scheduler +changes. + +##### Unit tests + + + + + +In alpha, the following unit tests were updated. See +https://github.com/kubernetes/kubernetes/pull/102028 and +https://github.com/kubernetes/kubernetes/pull/103082 for more context. + +- `k8s.io/kubernetes/pkg/apis/core/helper`: `09-22-2022` - `26.2` +- `k8s.io/kubernetes/pkg/apis/core/v1/helper`: `09-22-2022` - `56.9` +- `k8s.io/kubernetes/pkg/apis/core/validation`: `09-22-2022` - `82.3` +- `k8s.io/kubernetes/pkg/controller/volume/persistentvolume`: `09-22-2022` - `79.4` +- `k8s.io/kubernetes/pkg/kubelet/volumemanager/cache`: `09-22-2022` - `66.3` +- `k8s.io/kubernetes/pkg/volume/csi/csi_client.go`: `09-22-2022` - `76.2` +- `k8s.io/kubernetes/pkg/scheduler/apis/config/v1beta2`: `09-22-2022` - `76.8` +- `k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions`: `09-22-2022` - `85` +- `k8s.io/kubernetes/pkg/scheduler/framework`: `09-22-2022` - `77.1` + +In beta, there will be additional unit test coverage for +`k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions` to cover +preemption logic. + +##### Integration tests + + + +Integration tests for scheduler plugin behavior are available here: + +- [test/integration/scheduler/filters/filters_test.go] : [testgrid] + + +[test/integration/scheduler/filters/filters_test.go]: https://github.com/kubernetes/kubernetes/blob/v1.25.2/test/integration/scheduler/filters/filters_test.go +[testgrid]: https://testgrid.k8s.io/presubmits-kubernetes-blocking#pull-kubernetes-integration&include-filter-by-regex=k8s.io/kubernetes/test/integration/scheduler/filters.TestUnschedulablePodBecomesSchedulable/scheduled_pod_uses_read-write-once-pod_pvc + +##### e2e tests + + + +For alpha, to test this feature end to end, we will need to check the following cases: + +- A ReadWriteOncePod volume will succeed mounting when consumed by a single pod + on a node +- A ReadWriteOncePod volume will fail to mount when consumed by a second pod on + the same node +- A ReadWriteOncePod volume will fail to attach when consumed by a second pod on + a different node + +For testing the mapping for ReadWriteOnce, we will update the CSI hostpath +driver to support the new volume capability access modes and cut a release. 
The +existing Kubernetes end to end tests will be updated to use this version which +will test the K8s to CSI access mode mapping behavior because most storage end +to end tests rely on the ReadWriteOnce access mode, which now maps to the +SINGLE_NODE_MULTI_WRITER CSI access mode. + +E2E tests for alpha behavior can be found here: + +- [test/e2e/storage/testsuites/readwriteoncepod.go] : [k8s-triage] + +For beta, we will want to cover the additional cases for preemption: + +- A high-priority pod requesting a ReadWriteOncePod volume that's already in-use + will result in the preemption of the pod previously using the volume +- A low-priority (or no priority) pod requesting a ReadWriteOncePod volume + that's already in-use will result in it being UnschedulableAndUnresolvable + +[test/e2e/storage/testsuites/readwriteoncepod.go]: https://github.com/kubernetes/kubernetes/blob/master/test/e2e/storage/testsuites/readwriteoncepod.go +[k8s-triage]: https://storage.googleapis.com/k8s-triage/index.html?pr=1&test=read-write-once-pod + #### Validation of PersistentVolumeSpec Object To test the validation logic of the PersistentVolumeSpec, we need to check the @@ -538,20 +675,6 @@ well as in CSI sidecars. #### End to End Tests -To test this feature end to end, we will need to check the following cases: - -- A ReadWriteOncePod volume will succeed mounting when consumed by a single pod - on a node -- A ReadWriteOncePod volume will fail to mount when consumed by a second pod on - the same node -- A ReadWriteOncePod volume will fail to attach when consumed by a second pod on - a different node - -For testing the mapping for ReadWriteOnce, we should update the mock CSI driver -to support the new volume capability access modes and cut a release. The -existing Kubernetes end to end tests will be updated to use this version which -will test the mapping behavior because most storage end to end tests rely on the -ReadWriteOnce access mode. ### Graduation Criteria @@ -623,8 +746,6 @@ in back-to-back releases. - Scheduler enforces ReadWriteOncePod access mode by marking pods as Unschedulable, preemption logic added - ReadWriteOncePod access mode has end to end test coverage -- Mock CSI driver supports `SINGLE_NODE_*_WRITER` access modes, relevant end to - end tests updated to use this driver - Hostpath CSI driver supports `SINGLE_NODE_*_WRITER` access modes, relevant end to end tests updated to use this driver @@ -653,8 +774,11 @@ feature gate enabled. Additionally they will need to update their CSI drivers and sidecars to versions that depend on the new Kubernetes API and CSI spec. When downgrading a cluster to disable this feature, the user will need to -restart the kube-apiserver with the ReadWriteOncePod feature gate disabled. When -disabling this feature gate, any existing volumes with the ReadWriteOncePod +disable the ReadWriteOncePod feature gate in kube-apiserver, kube-scheduler, and +kubelet. They may also roll back their CSI sidecars if they are encountering +errors. + +When disabling this feature gate, any existing volumes with the ReadWriteOncePod access mode will continue to exist, but can only be deleted. An alternative is to allow these volumes to be treated as ReadWriteOnce, however that would violate the intent of the user and so it is not recommended. @@ -735,6 +859,45 @@ node service does not understand these access modes, the behavior will depend on the CSI driver and how it treats unknown access modes. 
The recommendation is to upgrade the CSI drivers for the controller and node services together. +#### API Server Has Feature Enabled, Scheduler and Kubelet Do Not + +In this scenario, the kube-scheduler will not enforce the ReadWriteOncePod +access mode and proceed to schedule pods sharing the same ReadWriteOncePod PVCs. + +If you have two pods sharing the same ReadWriteOncePod PVC and they land on +separate nodes, the volume will [only be able to attach to a single node]. The +other pod will be stuck because the volume is already attached elsewhere. + +However, if both pods land on the same node, kubelet will not enforce the access +mode and allow both pods to mount the same ReadWriteOncePod volume. + +[only be able to attach to a single node]: https://github.com/kubernetes/kubernetes/blob/v1.26.1/pkg/volume/util/util.go#L716-L718 + +#### API Server Has Feature Enabled, Scheduler Does not, Kubelet Does + +In this scenario, the kube-scheduler will not enforce the ReadWriteOncePod +access mode and proceed to schedule pods sharing the same ReadWriteOncePod PVCs. + +If you have two pods sharing the same ReadWriteOncePod PVC and they land on +separate nodes, the volume will [only be able to attach to a single node]. The +other pod will be stuck because the volume is already attached elsewhere. + +If both pods land on the same node, kubelet will enforce the access mode and +only allow one pod to mount the volume. + +[only be able to be attached to a single node]: https://github.com/kubernetes/kubernetes/blob/v1.26.1/pkg/volume/util/util.go#L716-L718 + +#### API Server Has Feature Enabled, Scheduler Does, Kubelet Does Not + +In this scenario, the kube-scheduler will enforce the ReadWriteOncePod access +mode and ensure only a single pod may use a ReadWriteOncePod PVC. + +If you have two pods sharing the same ReadWriteOncePod PVC and they both have +`spec.nodeName` set, then scheduling will be bypassed. See [the above scenario] +on the expected behavior. + +[the above scenario]: #api-server-has-feature-enabled-scheduler-and-kubelet-do-not + ## Production Readiness Review Questionnaire +Rolling out this feature involves enabling the ReadWriteOncePod feature gate +across kube-apiserver, kube-scheduler, kubelet, and updating CSI driver and +sidecar versions. The order in which these are performed does not matter. + +The only way this rollout can fail is if a user does not update all components, +in which case the feature will not work. See the above section on version skews +for behavior in this scenario. + +Rolling out this feature does not impact any running workloads. + ###### What specific metrics should inform a rollback? +If pods using ReadWriteOncePod PVCs fail to schedule, you may see an increase in +`scheduler_unschedulable_pods{plugin="VolumeRestrictions"}`. + +For enforcement in kubelet, if there are issues you may see changes in metrics +for "volume_mount" operations. For example, an increase in +`storage_operation_duration_seconds_bucket{operation_name="volume_mount"}` for +larger buckets may indicate issues with mount. + ###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? +Manual tests were performed to test the whole end to end flow. + +Starting with the upgrade path: + +- Unsuccessfully create workloads using ReadWriteOncePod PVCs prior to upgrade +- Perform the upgrade in two stages: + - First, update CSI sidecars + - Second, enable feature flag +- Successfully create workloads (1) and (2) using ReadWriteOncePod PVCs (1) and + (2). 
+- Unsuccessfully create workload (3) using ReadWriteOncePod PVC (2) (already + in-use) +- Observe the workloads and PVCs are healthy + +For the downgrade path: + +- Perform the downgrade in two stages: + - First, disable feature flag + - Second, downgrade CSI sidecars +- Observe the workloads and PVCs are still healthy +- Successfully delete workload (1) and ReadWriteOncePod PVC (1) + +And re-upgrading the feature again: + +- Perform the upgrade in two stages: + - First, update CSI sidecars + - Second, enable feature flag +- Successfully create workload (1) using ReadWriteOncePod PVC (1) +- Unsuccessfully create workload (3) using ReadWriteOncePod PVC (2) (already + in-use) +- Observe the workloads and PVCs are still healthy + ###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? +No. + ### Monitoring Requirements +An operator can query for PersistentVolumeClaims and PersistentVolumes in the +cluster with the ReadWriteOncePod access mode. If any exist then the feature is +in use. + ###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? -- [ ] Metrics - - Metric name: +- [X] Metrics + - Metric name: `scheduler_unschedulable_pods{plugin="VolumeRestrictions"}` - [Optional] Aggregation method: - Components exposing the metric: -- [ ] Other (treat as last resort) - - Details: + - kube-scheduler ###### What are the reasonable SLOs (Service Level Objectives) for the above SLIs? @@ -892,6 +1110,23 @@ high level (needs more precise definitions) those may be things like: - 99,9% of /health requests per day finish with 200 code --> +Defining an SLO for this metric is difficult because a pod may be +"Unschedulable" due to user error; they mistakenly scheduled a workload that has +volume conflicts. Additionally, this metric captures volume conflicts for some +legacy in-tree drivers that do not support the ReadWriteOncePod feature but are +part of the same scheduler plugin. + +Any unexpected increases in +`scheduler_unschedulable_pods{plugin="VolumeRestrictions"}` should be +investigated by checking the status of pods failing scheduling. + +If there are failures during attach, detach, mount, or unmount, you may see an +increase in the `storage_operation_duration_seconds` metric exported by +kubelet. + +You may also see an increase in the `csi_sidecar_operations_seconds_bucket` +metric exported by CSI sidecars if there are issues performing CSI operations. + ###### Are there any missing metrics that would be useful to have to improve observability of this feature? +No. + ### Dependencies +This feature depends on the cluster having CSI drivers and sidecars that use CSI +spec v1.5.0 at minimum. + +- [CSI drivers and sidecars] + - Usage description: + - Impact of its outage on the feature: Inability to perform CSI storage + operations on ReadWriteOncePod PVCs and PVs (for example, provisioning + volumes) + - Impact of its degraded performance or high-error rates on the feature: + Increase in latency performing CSI storage operations (due to repeated + retries) + ### Scalability +None. + ###### What steps should be taken if SLOs are not being met to determine the problem? +- Delete any unhealthy pods / PVCs using ReadWriteOncePod +- Disable the feature gate (target the API server first to prevent creation of new PVCs) +- Downgrade CSI sidecars and drivers if you're seeing elevated errors there + ## Implementation History