diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index dfc7f6804..7f780e683 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,12 +13,12 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - run: echo "$(go env GOPATH)/bin" >> $GITHUB_PATH - run: go install mvdan.cc/sh/v3/cmd/shfmt@latest - run: make lint test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - run: make test diff --git a/.github/workflows/stale-issues.yaml b/.github/workflows/stale-issues.yaml deleted file mode 100644 index a56181160..000000000 --- a/.github/workflows/stale-issues.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: 'Close stale issues' -on: - schedule: - # once a day at noon - - cron: '0 12 * * *' -permissions: - issues: write -jobs: - stale: - runs-on: ubuntu-latest - steps: - - uses: actions/stale@v6 - with: - days-before-stale: 90 - days-before-close: 14 - stale-issue-message: 'Please update this issue if it applies to the latest AMI release; otherwise it will be closed soon.' - stale-issue-label: 'stale' - exempt-issue-labels: 'never-stale' - # empty message will prevent PR's from being staled - stale-pr-message: '' - debug-only: true \ No newline at end of file diff --git a/.github/workflows/sync-eni-max-pods.yaml b/.github/workflows/sync-eni-max-pods.yaml index 76f02addf..9bb3275bc 100644 --- a/.github/workflows/sync-eni-max-pods.yaml +++ b/.github/workflows/sync-eni-max-pods.yaml @@ -14,7 +14,7 @@ jobs: if: github.repository == 'awslabs/amazon-eks-ami' runs-on: ubuntu-latest steps: - - uses: aws-actions/configure-aws-credentials@v1 + - uses: aws-actions/configure-aws-credentials@v2 with: aws-region: ${{ secrets.AWS_REGION }} role-to-assume: ${{ secrets.AWS_ROLE_ARN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 44e28e823..c316c3b97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,426 @@ # Changelog +### AMI Release v20230703 +* amazon-eks-gpu-node-1.27-v20230703 +* amazon-eks-gpu-node-1.26-v20230703 +* amazon-eks-gpu-node-1.25-v20230703 +* amazon-eks-gpu-node-1.24-v20230703 +* amazon-eks-gpu-node-1.23-v20230703 +* amazon-eks-gpu-node-1.22-v20230703 +* amazon-eks-arm64-node-1.27-v20230703 +* amazon-eks-arm64-node-1.26-v20230703 +* amazon-eks-arm64-node-1.25-v20230703 +* amazon-eks-arm64-node-1.24-v20230703 +* amazon-eks-arm64-node-1.23-v20230703 +* amazon-eks-arm64-node-1.22-v20230703 +* amazon-eks-node-1.27-v20230703 +* amazon-eks-node-1.26-v20230703 +* amazon-eks-node-1.25-v20230703 +* amazon-eks-node-1.24-v20230703 +* amazon-eks-node-1.23-v20230703 +* amazon-eks-node-1.22-v20230703 + +[Release versions](https://docs.aws.amazon.com/eks/latest/userguide/eks-linux-ami-versions.html) for these AMIs: +* `1.27.1-20230703` +* `1.26.4-20230703` +* `1.25.9-20230703` +* `1.24.13-20230703` +* `1.23.17-20230703` +* `1.22.17-20230703` + +Binaries used to build these AMIs are published: +* s3://amazon-eks/1.27.1/2023-04-19/ +* s3://amazon-eks/1.26.4/2023-05-11/ +* s3://amazon-eks/1.25.9/2023-05-11/ +* s3://amazon-eks/1.24.13/2023-05-11/ +* s3://amazon-eks/1.23.17/2023-05-11/ +* s3://amazon-eks/1.22.17/2023-05-11/ + +AMI details: +* `kernel`: + * Kubernetes 1.23 and below: 5.4.247-162.350.amzn2 + * Kubernetes 1.24 and above: 5.10.184-175.731.amzn2 +* `dockerd`: 20.10.23-1.amzn2.0.1 + * **Note** that Docker is not installed on AMI's with Kubernetes 1.25+. 
+* `containerd`: 1.6.19-1.amzn2.0.1 +* `runc`: 1.1.5-1.amzn2 +* `cuda`: 11.4.0-1 +* `nvidia-container-runtime-hook`: 1.4.0-1.amzn2 +* `amazon-ssm-agent`: 3.1.1732.0-1.amzn2 + +Notable changes: +- This is the last AMI release for Kubernetes 1.22 +- Update Kernel to 5.4.247-162.350.amzn2 to address [ALASKERNEL-5.4-2023-048](https://alas.aws.amazon.com/AL2/ALASKERNEL-5.4-2023-048.html), [CVE-2023-1206](https://alas.aws.amazon.com/cve/html/CVE-2023-1206.html) +- Update Kernel to 5.10.184-175.731.amzn2 to address [ALASKERNEL-5.10-2023-035](https://alas.aws.amazon.com/AL2/ALASKERNEL-5.10-2023-035.html), [CVE-2023-1206](https://alas.aws.amazon.com/cve/html/CVE-2023-1206.html) +- Use recommended clocksources ([#1328](https://github.com/awslabs/amazon-eks-ami/pull/1328)) +- Add configurable working directory ([#1231](https://github.com/awslabs/amazon-eks-ami/pull/1231)) +- Update eni-max-pods.txt ([#1330](https://github.com/awslabs/amazon-eks-ami/pull/1330)) +- Mount bpffs by default on 1.25+ ([#1320](https://github.com/awslabs/amazon-eks-ami/pull/1320)) + +### AMI Release v20230607 +* amazon-eks-gpu-node-1.27-v20230607 +* amazon-eks-gpu-node-1.26-v20230607 +* amazon-eks-gpu-node-1.25-v20230607 +* amazon-eks-gpu-node-1.24-v20230607 +* amazon-eks-gpu-node-1.23-v20230607 +* amazon-eks-gpu-node-1.22-v20230607 +* amazon-eks-arm64-node-1.27-v20230607 +* amazon-eks-arm64-node-1.26-v20230607 +* amazon-eks-arm64-node-1.25-v20230607 +* amazon-eks-arm64-node-1.24-v20230607 +* amazon-eks-arm64-node-1.23-v20230607 +* amazon-eks-arm64-node-1.22-v20230607 +* amazon-eks-node-1.27-v20230607 +* amazon-eks-node-1.26-v20230607 +* amazon-eks-node-1.25-v20230607 +* amazon-eks-node-1.24-v20230607 +* amazon-eks-node-1.23-v20230607 +* amazon-eks-node-1.22-v20230607 + +[Release versions](https://docs.aws.amazon.com/eks/latest/userguide/eks-linux-ami-versions.html) for these AMIs: +* `1.27.1-20230607` +* `1.26.4-20230607` +* `1.25.9-20230607` +* `1.24.13-20230607` +* `1.23.17-20230607` +* `1.22.17-20230607` + +Binaries used to build these AMIs are published: +* s3://amazon-eks/1.27.1/2023-04-19/ +* s3://amazon-eks/1.26.4/2023-05-11/ +* s3://amazon-eks/1.25.9/2023-05-11/ +* s3://amazon-eks/1.24.13/2023-05-11/ +* s3://amazon-eks/1.23.17/2023-05-11/ +* s3://amazon-eks/1.22.17/2023-05-11/ + +AMI details: +* `kernel`: + * Kubernetes 1.23 and below: 5.4.242-156.349.amzn2 + * Kubernetes 1.24 and above: 5.10.179-168.710.amzn2 +* `dockerd`: 20.10.23-1.amzn2.0.1 + * **Note** that Docker is not installed on AMI's with Kubernetes 1.25+. +* `containerd`: 1.6.19-1.amzn2.0.1 +* `runc`: 1.1.5-1.amzn2 +* `cuda`: 11.4.0-1 +* `nvidia-container-runtime-hook`: 1.4.0-1.amzn2 +* `amazon-ssm-agent`: 3.1.1732.0-1.amzn2 + +Notable changes: +* `5.4` kernel update to `5.4.242-156.349.amzn2` and `5.10` kernel update to `5.10.179-168.710.amzn2` address [CVE-2023-32233](https://alas.aws.amazon.com/cve/html/CVE-2023-32233.html) +* Updating `runc` version to `1.1.5-1.amzn2` which contains fixes for [CVE-2023-28642](https://explore.alas.aws.amazon.com/CVE-2023-28642.html) and [CVE-2023-27561](https://explore.alas.aws.amazon.com/CVE-2023-27561.html). 
+ +### AMI Release v20230526 +* amazon-eks-gpu-node-1.27-v20230526 +* amazon-eks-gpu-node-1.26-v20230526 +* amazon-eks-gpu-node-1.25-v20230526 +* amazon-eks-gpu-node-1.24-v20230526 +* amazon-eks-gpu-node-1.23-v20230526 +* amazon-eks-gpu-node-1.22-v20230526 +* amazon-eks-arm64-node-1.27-v20230526 +* amazon-eks-arm64-node-1.26-v20230526 +* amazon-eks-arm64-node-1.25-v20230526 +* amazon-eks-arm64-node-1.24-v20230526 +* amazon-eks-arm64-node-1.23-v20230526 +* amazon-eks-arm64-node-1.22-v20230526 +* amazon-eks-node-1.27-v20230526 +* amazon-eks-node-1.26-v20230526 +* amazon-eks-node-1.25-v20230526 +* amazon-eks-node-1.24-v20230526 +* amazon-eks-node-1.23-v20230526 +* amazon-eks-node-1.22-v20230526 + +[Release versions](https://docs.aws.amazon.com/eks/latest/userguide/eks-linux-ami-versions.html) for these AMIs: +* `1.27.1-20230526` +* `1.26.4-20230526` +* `1.25.9-20230526` +* `1.24.13-20230526` +* `1.23.17-20230526` +* `1.22.17-20230526` + +Binaries used to build these AMIs are published: +* s3://amazon-eks/1.27.1/2023-04-19/ +* s3://amazon-eks/1.26.4/2023-05-11/ +* s3://amazon-eks/1.25.9/2023-05-11/ +* s3://amazon-eks/1.24.13/2023-05-11/ +* s3://amazon-eks/1.23.17/2023-05-11/ +* s3://amazon-eks/1.22.17/2023-05-11/ + +AMI details: +* `kernel`: + * Kubernetes 1.23 and below: 5.4.242-155.348.amzn2 + * Kubernetes 1.24 and above: 5.10.179-166.674.amzn2 +* `dockerd`: 20.10.23-1.amzn2.0.1 + * **Note** that Docker is not installed on AMI's with Kubernetes 1.25+. +* `containerd`: 1.6.19-1.amzn2.0.1 +* `runc`: 1.1.4-1.amzn2 +* `cuda`: 11.4.0-1 +* `nvidia-container-runtime-hook`: 1.4.0-1.amzn2 +* `amazon-ssm-agent`: 3.1.1732.0-1.amzn2 + +Notable changes: +* `5.4` kernel update to `5.4.242-155.348.amzn2` addresses CVE [ALAS2KERNEL-5.4-2023-045](https://alas.aws.amazon.com/AL2/ALASKERNEL-5.4-2023-045.html) +* `5.10` kernel update to `5.10.179-166.674.amzn2` addresses [ALAS2KERNEL-5.10-2023-032](https://alas.aws.amazon.com/AL2/ALASKERNEL-5.10-2023-032.html) +* `Glib` update to `glib2-2.56.1-9.amzn2` addresses [ALAS-2023-2049](https://alas.aws.amazon.com/AL2/ALAS-2023-2049.html) + +### AMI Release v20230513 +* amazon-eks-gpu-node-1.27-v20230513 +* amazon-eks-gpu-node-1.26-v20230513 +* amazon-eks-gpu-node-1.25-v20230513 +* amazon-eks-gpu-node-1.24-v20230513 +* amazon-eks-gpu-node-1.23-v20230513 +* amazon-eks-gpu-node-1.22-v20230513 +* amazon-eks-arm64-node-1.27-v20230513 +* amazon-eks-arm64-node-1.26-v20230513 +* amazon-eks-arm64-node-1.25-v20230513 +* amazon-eks-arm64-node-1.24-v20230513 +* amazon-eks-arm64-node-1.23-v20230513 +* amazon-eks-arm64-node-1.22-v20230513 +* amazon-eks-node-1.27-v20230513 +* amazon-eks-node-1.26-v20230513 +* amazon-eks-node-1.25-v20230513 +* amazon-eks-node-1.24-v20230513 +* amazon-eks-node-1.23-v20230513 +* amazon-eks-node-1.22-v20230513 + +[Release versions](https://docs.aws.amazon.com/eks/latest/userguide/eks-linux-ami-versions.html) for these AMIs: +* `1.27.1-20230513` +* `1.26.4-20230513` +* `1.25.9-20230513` +* `1.24.13-20230513` +* `1.23.17-20230513` +* `1.22.17-20230513` + +Binaries used to build these AMIs are published: +* s3://amazon-eks/1.27.1/2023-04-19/ +* s3://amazon-eks/1.26.4/2023-05-11/ +* s3://amazon-eks/1.25.9/2023-05-11/ +* s3://amazon-eks/1.24.13/2023-05-11/ +* s3://amazon-eks/1.23.17/2023-05-11/ +* s3://amazon-eks/1.22.17/2023-05-11/ + +AMI details: +* `kernel`: + * Kubernetes 1.23 and below: 5.4.241-150.347.amzn2 + * Kubernetes 1.24 and above: 5.10.178-162.673.amzn2 +* `dockerd`: 20.10.23-1.amzn2.0.1 + * **Note** that Docker is not installed on AMI's with 
Kubernetes 1.25+. +* `containerd`: 1.6.19-1.amzn2.0.1 +* `runc`: 1.1.4-1.amzn2 +* `cuda`: 11.4.0-1 +* `nvidia-container-runtime-hook`: 1.4.0-1.amzn2 +* `amazon-ssm-agent`: 3.1.1732.0-1.amzn2 + +Notable changes: + - Add support for Kubernetes 1.27 ([#1300](https://github.com/awslabs/amazon-eks-ami/pull/1300)) + +Other changes: + - Updated max pods for i4g instance types ([#1296](https://github.com/awslabs/amazon-eks-ami/commit/0de475c5f802acd470d9a2f1fdd521b7949a25ec)) + +### AMI Release v20230509 +* amazon-eks-gpu-node-1.26-v20230509 +* amazon-eks-gpu-node-1.25-v20230509 +* amazon-eks-gpu-node-1.24-v20230509 +* amazon-eks-gpu-node-1.23-v20230509 +* amazon-eks-gpu-node-1.22-v20230509 +* amazon-eks-arm64-node-1.26-v20230509 +* amazon-eks-arm64-node-1.25-v20230509 +* amazon-eks-arm64-node-1.24-v20230509 +* amazon-eks-arm64-node-1.23-v20230509 +* amazon-eks-arm64-node-1.22-v20230509 +* amazon-eks-node-1.26-v20230509 +* amazon-eks-node-1.25-v20230509 +* amazon-eks-node-1.24-v20230509 +* amazon-eks-node-1.23-v20230509 +* amazon-eks-node-1.22-v20230509 + +[Release versions](https://docs.aws.amazon.com/eks/latest/userguide/eks-linux-ami-versions.html) for these AMIs: +* `1.26.2-20230509` +* `1.25.7-20230509` +* `1.24.11-20230509` +* `1.23.17-20230509` +* `1.22.17-20230509` + +Binaries used to build these AMIs are published: +* s3://amazon-eks/1.26.2/2023-03-17/ +* s3://amazon-eks/1.25.7/2023-03-17/ +* s3://amazon-eks/1.24.11/2023-03-17/ +* s3://amazon-eks/1.23.17/2023-03-17/ +* s3://amazon-eks/1.22.17/2023-03-17/ + +AMI details: +* `kernel`: + * Kubernetes 1.23 and below: 5.4.241-150.347.amzn2 + * Kubernetes 1.24 and above: 5.10.178-162.673.amzn2 +* `dockerd`: 20.10.23-1.amzn2.0.1 + * **Note** that Docker is not installed on AMI's with Kubernetes 1.25+. +* `containerd`: 1.6.19-1.amzn2.0.1 +* `runc`: 1.1.4-1.amzn2 +* `cuda`: 11.4.0-1 +* `nvidia-container-runtime-hook`: 1.4.0-1.amzn2 +* `amazon-ssm-agent`: 3.1.1732.0-1.amzn2 + +Notable changes: +- The new AMIs have updated docker version 20.10.23-1.amzn2.0.1 that addresses two docker CVEs; [CVE-2022-36109 - docker](https://alas.aws.amazon.com/cve/html/CVE-2022-36109.html) and [CVE-2022-37708 - docker](https://alas.aws.amazon.com/cve/html/CVE-2022-37708.html). +- For the GPU Variants of these AMIs, the Nvidia Fabric Manager version is upgraded from 470.161.03-1 to 470.182.03-1. 
+- Fix ECR pattern for aws-cn ([#1280](https://github.com/awslabs/amazon-eks-ami/pull/1280)) +- Fix imds setting for multiple enis on ipv6 ([1275](https://github.com/awslabs/amazon-eks-ami/pull/1275)) + +### AMI Release v20230501 +* amazon-eks-gpu-node-1.26-v20230501 +* amazon-eks-gpu-node-1.25-v20230501 +* amazon-eks-gpu-node-1.24-v20230501 +* amazon-eks-gpu-node-1.23-v20230501 +* amazon-eks-gpu-node-1.22-v20230501 +* amazon-eks-arm64-node-1.26-v20230501 +* amazon-eks-arm64-node-1.25-v20230501 +* amazon-eks-arm64-node-1.24-v20230501 +* amazon-eks-arm64-node-1.23-v20230501 +* amazon-eks-arm64-node-1.22-v20230501 +* amazon-eks-node-1.26-v20230501 +* amazon-eks-node-1.25-v20230501 +* amazon-eks-node-1.24-v20230501 +* amazon-eks-node-1.23-v20230501 +* amazon-eks-node-1.22-v20230501 + +[Release versions](https://docs.aws.amazon.com/eks/latest/userguide/eks-linux-ami-versions.html) for these AMIs: +* `1.26.2-20230501` +* `1.25.7-20230501` +* `1.24.11-20230501` +* `1.23.17-20230501` +* `1.22.17-20230501` + +Binaries used to build these AMIs are published: +* s3://amazon-eks/1.26.2/2023-03-17/ +* s3://amazon-eks/1.25.7/2023-03-17/ +* s3://amazon-eks/1.24.11/2023-03-17/ +* s3://amazon-eks/1.23.17/2023-03-17/ +* s3://amazon-eks/1.22.17/2023-03-17/ + +AMI details: +* `kernel`: + * Kubernetes 1.23 and below: 5.4.241-150.347.amzn2 + * Kubernetes 1.24 and above: 5.10.178-162.673.amzn2 +* `dockerd`: 20.10.17-1.amzn2.0.1 + * **Note** that Docker is not installed on AMI's with Kubernetes 1.25+. +* `containerd`: 1.6.19-1.amzn2.0.1 +* `runc`: 1.1.4-1.amzn2 +* `cuda`: 11.4.0-1 +* `nvidia-container-runtime-hook`: 1.4.0-1.amzn2 +* `amazon-ssm-agent`: 3.1.1732.0-1.amzn2 + +Notable changes: +- Add bootstrap option to create a local NVMe raid0 or individual volume mounts ([#1171](https://github.com/awslabs/amazon-eks-ami/pull/1171)) +- Improve bootstrap logging ([#1276](https://github.com/awslabs/amazon-eks-ami/pull/1276)) +- Use credential provider API v1 in 1.27+, v1alpha1 in 1.26- ([#1269](https://github.com/awslabs/amazon-eks-ami/pull/1269)) +- Override hostname to match EC2's PrivateDnsName ([#1264](https://github.com/awslabs/amazon-eks-ami/pull/1264)) +- Add ethtool ([#1261](https://github.com/awslabs/amazon-eks-ami/pull/1261)) +- Update `kernel-5.10` for [ALASKERNEL-5.10-2023-031](https://alas.aws.amazon.com/AL2/ALASKERNEL-5.10-2023-031.html) +- Kernel version upgrade to `5.10.178-162.673.amzn2` fixes the [Containers failing to create and probe exec errors related to seccomp on recent kernel-5.10 versions](https://github.com/awslabs/amazon-eks-ami/issues/1219) issue + + +### AMI Release v20230411 +* amazon-eks-gpu-node-1.26-v20230411 +* amazon-eks-gpu-node-1.25-v20230411 +* amazon-eks-gpu-node-1.24-v20230411 +* amazon-eks-gpu-node-1.23-v20230411 +* amazon-eks-gpu-node-1.22-v20230411 +* amazon-eks-arm64-node-1.26-v20230411 +* amazon-eks-arm64-node-1.25-v20230411 +* amazon-eks-arm64-node-1.24-v20230411 +* amazon-eks-arm64-node-1.23-v20230411 +* amazon-eks-arm64-node-1.22-v20230411 +* amazon-eks-node-1.26-v20230411 +* amazon-eks-node-1.25-v20230411 +* amazon-eks-node-1.24-v20230411 +* amazon-eks-node-1.23-v20230411 +* amazon-eks-node-1.22-v20230411 + +[Release versions](https://docs.aws.amazon.com/eks/latest/userguide/eks-linux-ami-versions.html) for these AMIs: +* `1.26.2-20230411` +* `1.25.7-20230411` +* `1.24.11-20230411` +* `1.23.17-20230411` +* `1.22.17-20230411` + +Binaries used to build these AMIs are published: +* s3://amazon-eks/1.26.2/2023-03-17/ +* s3://amazon-eks/1.25.7/2023-03-17/ +* 
s3://amazon-eks/1.24.11/2023-03-17/ +* s3://amazon-eks/1.23.17/2023-03-17/ +* s3://amazon-eks/1.22.17/2023-03-17/ + +AMI details: +* `kernel`: + * Kubernetes 1.23 and below: 5.4.238-148.347.amzn2 + * Kubernetes 1.24 and above: 5.10.176-157.645.amzn2 +* `dockerd`: 20.10.17-1.amzn2.0.1 + * **Note** that Docker is not installed on AMI's with Kubernetes 1.25+. +* `containerd`: 1.6.19-1.amzn2.0.1 +* `runc`: 1.1.4 +* `cuda`: 11.4.0-1 +* `nvidia-container-runtime-hook`: 1.4.0-1.amzn2 +* `amazon-ssm-agent`: 3.1.1732.0 + +Notable changes: +- The AMI changes include update for 5.4 kernel version from `5.4.238-148.346.amzn2` to `kernel-5.4.238-148.347.amzn2`. `kernel-5.4.238-148.346` had a fatal issue affecting SMB mounts in which a null pointer dereference caused a panic. As a result, this package was removed from the Amazon Linux 2 repositories. + +### AMI Release v20230406 +* amazon-eks-gpu-node-1.26-v20230406 +* amazon-eks-gpu-node-1.25-v20230406 +* amazon-eks-gpu-node-1.24-v20230406 +* amazon-eks-gpu-node-1.23-v20230406 +* amazon-eks-gpu-node-1.22-v20230406 +* amazon-eks-arm64-node-1.26-v20230406 +* amazon-eks-arm64-node-1.25-v20230406 +* amazon-eks-arm64-node-1.24-v20230406 +* amazon-eks-arm64-node-1.23-v20230406 +* amazon-eks-arm64-node-1.22-v20230406 +* amazon-eks-node-1.26-v20230406 +* amazon-eks-node-1.25-v20230406 +* amazon-eks-node-1.24-v20230406 +* amazon-eks-node-1.23-v20230406 +* amazon-eks-node-1.22-v20230406 + +[Release versions](https://docs.aws.amazon.com/eks/latest/userguide/eks-linux-ami-versions.html) for these AMIs: +* `1.26.2-20230406` +* `1.25.7-20230406` +* `1.24.11-20230406` +* `1.23.17-20230406` +* `1.22.17-20230406` + +Binaries used to build these AMIs are published: +* s3://amazon-eks/1.26.2/2023-03-17/ +* s3://amazon-eks/1.25.7/2023-03-17/ +* s3://amazon-eks/1.24.11/2023-03-17/ +* s3://amazon-eks/1.23.17/2023-03-17/ +* s3://amazon-eks/1.22.17/2023-03-17/ + +AMI details: +* `kernel`: + * Kubernetes 1.23 and below: 5.4.238-148.346.amzn2 + * Kubernetes 1.24 and above: 5.10.173-154.642.amzn2 +* `dockerd`: 20.10.17-1.amzn2.0.1 + * **Note** that Docker is not installed on AMI's with Kubernetes 1.25+. +* `containerd`: 1.6.19-1.amzn2.0.1 +* `runc`: 1.1.4 +* `cuda`: 11.4.0-1 +* `nvidia-container-runtime-hook`: 1.4.0-1.amzn2 +* `amazon-ssm-agent`: 3.1.1732.0 + +Notable changes: +- Add support for Kubernetes 1.26 ([#1246](https://github.com/awslabs/amazon-eks-ami/pull/1246)) +- Add support `inf2`, `trn1n` instance types ([#1251](https://github.com/awslabs/amazon-eks-ami/pull/1251)) +- Updated `containerd` to address: + - [ALASDOCKER-2023-023](https://alas.aws.amazon.com/AL2/ALASDOCKER-2023-023.html) +- Fixed `ecr-credential-provider` flags not being passed correctly to `kubelet` ([#1240](https://github.com/awslabs/amazon-eks-ami/pull/1240)) + - Added `--image-credential-provider-config` and `--image-credential-provider-bin-dir` flags to the `systemd` units. + - Set `KubeletCredentialProviders` feature flag to `true` in the `kubelet` JSON config. 
+ +Other changes: +- Use `gp3 volume_type` for 1.27+ ([#1197](https://github.com/awslabs/amazon-eks-ami/pull/1197)) +- Use default kubelet API QPS for 1.27+ ([#1241](https://github.com/awslabs/amazon-eks-ami/pull/1241)) +- Remove `--container-runtime` kubelet flag for 1.27+ ([#1250](https://github.com/awslabs/amazon-eks-ami/pull/1250)) + ### AMI Release v20230322 * amazon-eks-gpu-node-1.25-v20230322 * amazon-eks-gpu-node-1.24-v20230322 diff --git a/Makefile b/Makefile index 9b8d6e952..b070995ad 100644 --- a/Makefile +++ b/Makefile @@ -1,34 +1,51 @@ +MAKEFILE_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +PACKER_DEFAULT_VARIABLE_FILE ?= $(MAKEFILE_DIR)/eks-worker-al2-variables.json +PACKER_TEMPLATE_FILE ?= $(MAKEFILE_DIR)/eks-worker-al2.json PACKER_BINARY ?= packer -AVAILABLE_PACKER_VARIABLES := $(shell $(PACKER_BINARY) inspect -machine-readable eks-worker-al2.json | grep 'template-variable' | awk -F ',' '{print $$4}') +AVAILABLE_PACKER_VARIABLES := $(shell $(PACKER_BINARY) inspect -machine-readable $(PACKER_TEMPLATE_FILE) | grep 'template-variable' | awk -F ',' '{print $$4}') K8S_VERSION_PARTS := $(subst ., ,$(kubernetes_version)) K8S_VERSION_MINOR := $(word 1,${K8S_VERSION_PARTS}).$(word 2,${K8S_VERSION_PARTS}) -MAKEFILE_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) +# expands to 'true' if PACKER_VARIABLE_FILE is non-empty +# and the file contains the string passed as the first argument +# otherwise, expands to 'false' +packer_variable_file_contains = $(if $(PACKER_VARIABLE_FILE),$(shell grep -Fq $1 $(PACKER_VARIABLE_FILE) && echo true || echo false),false) + +# expands to 'true' if the version comparison is affirmative +# otherwise expands to 'false' +vercmp = $(shell $(MAKEFILE_DIR)/files/bin/vercmp "$1" "$2" "$3") # Docker is not present on 1.25+ AMI's -ifeq ($(shell $(MAKEFILE_DIR)/files/bin/vercmp "$(kubernetes_version)" gteq "1.25.0"), true) -# do not tag the AMI with the Docker version -docker_version ?= none -# do not include the Docker version in the AMI description -ami_component_description ?= (k8s: {{ user `kubernetes_version` }}, containerd: {{ user `containerd_version` }}) +# TODO: remove this when 1.24 reaches EOL +ifeq ($(call vercmp,$(kubernetes_version),gteq,1.25.0), true) + # do not tag the AMI with the Docker version + docker_version ?= none + # do not include the Docker version in the AMI description + ami_component_description ?= (k8s: {{ user `kubernetes_version` }}, containerd: {{ user `containerd_version` }}) +endif + +OS= +ifneq (,$(findstring al2023, $(PACKER_TEMPLATE_FILE))) + OS=-al2023 endif arch ?= x86_64 ifeq ($(arch), arm64) -instance_type ?= m6g.large -ami_name ?= amazon-eks-arm64-node-$(K8S_VERSION_MINOR)-v$(shell date +'%Y%m%d') + instance_type ?= m6g.large + ami_name ?= amazon-eks-arm64-node$(OS)-$(K8S_VERSION_MINOR)-v$(shell date +'%Y%m%d') else -instance_type ?= m4.large -ami_name ?= amazon-eks-node-$(K8S_VERSION_MINOR)-v$(shell date +'%Y%m%d') + instance_type ?= m5.large + ami_name ?= amazon-eks-node$(OS)-$(K8S_VERSION_MINOR)-v$(shell date +'%Y%m%d') endif ifeq ($(aws_region), cn-northwest-1) -source_ami_owners ?= 141808717104 + source_ami_owners ?= 141808717104 endif ifeq ($(aws_region), us-gov-west-1) -source_ami_owners ?= 045324592363 + source_ami_owners ?= 045324592363 endif T_RED := \e[0;31m @@ -37,7 +54,7 @@ T_YELLOW := \e[0;33m T_RESET := \e[0m .PHONY: latest -latest: 1.26 ## Build EKS Optimized AL2 AMI with the latest supported version of Kubernetes +latest: 1.27 ## Build EKS Optimized AL2 AMI with 
the latest supported version of Kubernetes # ensure that these flags are equivalent to the rules in the .editorconfig SHFMT_FLAGS := --list \ @@ -49,7 +66,7 @@ SHFMT_FLAGS := --list \ SHFMT_COMMAND := $(shell which shfmt) ifeq (, $(SHFMT_COMMAND)) -SHFMT_COMMAND = docker run --rm -v $(MAKEFILE_DIR):$(MAKEFILE_DIR) mvdan/shfmt + SHFMT_COMMAND = docker run --rm -v $(MAKEFILE_DIR):$(MAKEFILE_DIR) mvdan/shfmt endif .PHONY: fmt @@ -58,10 +75,16 @@ fmt: ## Format the source files SHELLCHECK_COMMAND := $(shell which shellcheck) ifeq (, $(SHELLCHECK_COMMAND)) -SHELLCHECK_COMMAND = docker run --rm -v $(MAKEFILE_DIR):$(MAKEFILE_DIR) koalaman/shellcheck:stable + SHELLCHECK_COMMAND = docker run --rm -v $(MAKEFILE_DIR):$(MAKEFILE_DIR) koalaman/shellcheck:stable endif SHELL_FILES := $(shell find $(MAKEFILE_DIR) -type f -name '*.sh') +.PHONY: transform-al2-to-al2023 +transform-al2-to-al2023: + PACKER_TEMPLATE_FILE=$(PACKER_TEMPLATE_FILE) \ + PACKER_DEFAULT_VARIABLE_FILE=$(PACKER_DEFAULT_VARIABLE_FILE) \ + hack/transform-al2-to-al2023.sh + .PHONY: lint lint: ## Check the source files for syntax and format issues $(SHFMT_COMMAND) $(SHFMT_FLAGS) --diff $(MAKEFILE_DIR) @@ -73,41 +96,41 @@ test: ## run the test-harness # include only variables which have a defined value PACKER_VARIABLES := $(foreach packerVar,$(AVAILABLE_PACKER_VARIABLES),$(if $($(packerVar)),$(packerVar))) -PACKER_VAR_FLAGS := -var-file eks-worker-al2-variables.json \ -$(if $(PACKER_VARIABLE_FILE),--var-file=$(PACKER_VARIABLE_FILE),) \ +PACKER_VAR_FLAGS := -var-file $(PACKER_DEFAULT_VARIABLE_FILE) \ +$(if $(PACKER_VARIABLE_FILE),-var-file=$(PACKER_VARIABLE_FILE),) \ $(foreach packerVar,$(PACKER_VARIABLES),-var $(packerVar)='$($(packerVar))') .PHONY: validate validate: ## Validate packer config - $(PACKER_BINARY) validate $(PACKER_VAR_FLAGS) eks-worker-al2.json + $(PACKER_BINARY) validate $(PACKER_VAR_FLAGS) $(PACKER_TEMPLATE_FILE) .PHONY: k8s k8s: validate ## Build default K8s version of EKS Optimized AL2 AMI @echo "$(T_GREEN)Building AMI for version $(T_YELLOW)$(kubernetes_version)$(T_GREEN) on $(T_YELLOW)$(arch)$(T_RESET)" - $(PACKER_BINARY) build -timestamp-ui -color=false $(PACKER_VAR_FLAGS) eks-worker-al2.json + $(PACKER_BINARY) build -timestamp-ui -color=false $(PACKER_VAR_FLAGS) $(PACKER_TEMPLATE_FILE) # Build dates and versions taken from https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html -.PHONY: 1.22 -1.22: ## Build EKS Optimized AL2 AMI - K8s 1.22 - $(MAKE) k8s kubernetes_version=1.22.17 kubernetes_build_date=2023-03-17 pull_cni_from_github=true - .PHONY: 1.23 1.23: ## Build EKS Optimized AL2 AMI - K8s 1.23 - $(MAKE) k8s kubernetes_version=1.23.17 kubernetes_build_date=2023-03-17 pull_cni_from_github=true + $(MAKE) k8s kubernetes_version=1.23.17 kubernetes_build_date=2023-05-11 pull_cni_from_github=true .PHONY: 1.24 1.24: ## Build EKS Optimized AL2 AMI - K8s 1.24 - $(MAKE) k8s kubernetes_version=1.24.11 kubernetes_build_date=2023-03-17 pull_cni_from_github=true + $(MAKE) k8s kubernetes_version=1.24.13 kubernetes_build_date=2023-05-11 pull_cni_from_github=true .PHONY: 1.25 1.25: ## Build EKS Optimized AL2 AMI - K8s 1.25 - $(MAKE) k8s kubernetes_version=1.25.7 kubernetes_build_date=2023-03-17 pull_cni_from_github=true + $(MAKE) k8s kubernetes_version=1.25.9 kubernetes_build_date=2023-05-11 pull_cni_from_github=true .PHONY: 1.26 1.26: ## Build EKS Optimized AL2 AMI - K8s 1.26 - $(MAKE) k8s kubernetes_version=1.26.2 kubernetes_build_date=2023-03-17 pull_cni_from_github=true + $(MAKE) k8s kubernetes_version=1.26.4 
kubernetes_build_date=2023-05-11 pull_cni_from_github=true +.PHONY: 1.27 +1.27: ## Build EKS Optimized AL2 AMI - K8s 1.27 + $(MAKE) k8s kubernetes_version=1.27.1 kubernetes_build_date=2023-04-19 pull_cni_from_github=true + .PHONY: clean clean: rm *-manifest.json diff --git a/doc/USER_GUIDE.md b/doc/USER_GUIDE.md index 4e7291138..c8f79a5bf 100644 --- a/doc/USER_GUIDE.md +++ b/doc/USER_GUIDE.md @@ -11,6 +11,8 @@ This document includes details about using the AMI template and the resulting AM 1. [AL2 and Linux kernel information](#al2-and-linux-kernel-information) 1. [Updating known instance types](#updating-known-instance-types) 1. [Version-locked packages](#version-locked-packages) +1. [Image credential provider plugins](#image-credential-provider-plugins) +1. [Ephemeral Storage](#ephemeral-storage) --- @@ -21,9 +23,58 @@ Default values for most variables are defined in [a default variable file](eks-w Users have the following options for specifying their own values: 1. Provide a variable file with the `PACKER_VARIABLE_FILE` argument to `make`. Values in this file will override values in the default variable file. Your variable file does not need to include all possible variables, as it will be merged with the default variable file. -2. Pass a key-value pair for any template variable to `make`. These values will override any values that were specified with the first method. +2. Pass a key-value pair for any template variable to `make`. These values will override any values that were specified with the first method. In the table below, these variables have a default value of "None". -**Note** that some variables (such as `arch` and `kubernetes_version`) do not have a sensible, static default, and are satisfied by the Makefile. Such variables do not appear in the default variable file, and must be overridden (if necessary) by the second method described above. +> **Note** +> Some variables (such as `arch` and `kubernetes_version`) do not have a sensible, static default, and are satisfied by the Makefile. +> Such variables do not appear in the default variable file, and must be overridden (if necessary) by the second method described above. 
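As a quick, hypothetical illustration of the two methods (the file name `my-vars.json` and the values shown are placeholders, not repository defaults):

```bash
# Method 1: merge a custom variable file over the default variable file
make 1.27 PACKER_VARIABLE_FILE=my-vars.json

# Method 2: pass individual template variables directly to make;
# these take precedence over values from both variable files
make k8s kubernetes_version=1.27.1 kubernetes_build_date=2023-04-19 arch=arm64
```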
+ + + +| Variable | Default value | Description | +| - | - | - | +| `additional_yum_repos` | `""` | | +| `ami_component_description` | ```(k8s: {{ user `kubernetes_version` }}, docker: {{ user `docker_version` }}, containerd: {{ user `containerd_version` }})``` | | +| `ami_description` | `EKS Kubernetes Worker AMI with AmazonLinux2 image` | | +| `ami_name` | None | | +| `ami_regions` | `""` | | +| `ami_users` | `""` | | +| `arch` | None | | +| `associate_public_ip_address` | `""` | | +| `aws_access_key_id` | ```{{env `AWS_ACCESS_KEY_ID`}}``` | | +| `aws_region` | ```{{env `AWS_DEFAULT_REGION`}}``` | | +| `aws_secret_access_key` | ```{{env `AWS_SECRET_ACCESS_KEY`}}``` | | +| `aws_session_token` | ```{{env `AWS_SESSION_TOKEN`}}``` | | +| `binary_bucket_name` | `amazon-eks` | | +| `binary_bucket_region` | `us-west-2` | | +| `cache_container_images` | `false` | | +| `cni_plugin_version` | `v0.8.6` | | +| `containerd_version` | `1.6.*` | | +| `creator` | ```{{env `USER`}}``` | | +| `docker_version` | `20.10.23-1.amzn2.0.1` | | +| `encrypted` | `false` | | +| `instance_type` | None | | +| `kernel_version` | `""` | | +| `kms_key_id` | `""` | | +| `kubernetes_build_date` | None | | +| `kubernetes_version` | None | | +| `launch_block_device_mappings_volume_size` | `4` | | +| `pause_container_version` | `3.5` | | +| `pull_cni_from_github` | `true` | | +| `remote_folder` | `/tmp` | Directory path for shell provisioner scripts on the builder instance | +| `runc_version` | `1.1.5-1.amzn2` | | +| `security_group_id` | `""` | | +| `sonobuoy_e2e_registry` | `""` | | +| `source_ami_filter_name` | `amzn2-ami-minimal-hvm-*` | | +| `source_ami_id` | `""` | | +| `source_ami_owners` | `137112412989` | | +| `ssh_interface` | `""` | | +| `ssh_username` | `ec2-user` | | +| `subnet_id` | `""` | | +| `temporary_security_group_source_cidrs` | `""` | | +| `volume_type` | `gp2` | | +| `working_dir` | ```{{user `remote_folder`}}/worker``` | Directory path for ephemeral resources on the builder instance | + --- @@ -309,3 +360,32 @@ sudo yum versionlock delete $PACKAGE_NAME # unlock all packages sudo yum versionlock clear ``` + +--- + +## Image credential provider plugins + +Prior to Kubernetes 1.27, the `kubelet` could obtain credentials for ECR out of the box. This legacy credential process has been removed in Kubernetes 1.27, and +ECR credentials should now be obtained via a plugin, the `ecr-credential-provider`. This plugin is installed in the AMI at `/etc/eks/image-credential-provider/ecr-credential-provider`. More information about this plugin is available in the [`cloud-provider-aws` documentation](https://cloud-provider-aws.sigs.k8s.io/credential_provider/). + +Additional image credential provider plugins may be appended to `/etc/eks/image-credential-provider/config.json`. In Kubernetes versions 1.26 and below, all plugins in this file must support `credentialprovider.kubelet.k8s.io/v1alpha1`. In Kubernetes versions 1.27 and above, they must support `credentialprovider.kubelet.k8s.io/v1`. 
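As a sketch of what such an addition could look like on a Kubernetes 1.27+ node, the snippet below appends a hypothetical plugin entry using the same `jq` rewrite idiom that `bootstrap.sh` applies to this file; the plugin name and registry pattern are illustrative only:

```bash
CONFIG=/etc/eks/image-credential-provider/config.json
# "my-credential-provider" and the matchImages pattern are placeholders,
# not plugins shipped in the AMI
echo "$(jq '.providers += [{
  "name": "my-credential-provider",
  "matchImages": ["*.registry.example.com"],
  "defaultCacheDuration": "12h",
  "apiVersion": "credentialprovider.kubelet.k8s.io/v1"
}]' "$CONFIG")" > "$CONFIG"
```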
+ +For more information about image credential provider plugins, refer to the [Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/kubelet-credential-provider/). + +--- + +## Ephemeral Storage + +Some instance types launch with ephemeral NVMe instance storage (i3, i4i, c5d, c6id, etc.). There are two main ways of utilizing this storage within Kubernetes: a single RAID-0 array for use by kubelet and containerd, or mounting the individual disks for pod usage. + +The EKS Optimized AMI includes a utility script to configure ephemeral storage. The script can be invoked by passing the `--local-disks <raid0 | mount>` flag to the `/etc/eks/bootstrap.sh` script, or the script can be invoked directly at `/bin/setup-local-disks`. All disks are formatted with an XFS file system. + +Below are details on the two disk setup options: + +### RAID-0 for Kubelet and Containerd (raid0) + +A RAID-0 array is set up that includes all ephemeral NVMe instance storage disks. The containerd and kubelet state directories (`/var/lib/containerd` and `/var/lib/kubelet`) will then use the ephemeral storage for more and faster node ephemeral-storage. The node's ephemeral storage can be shared among pods that request ephemeral storage and container images that are downloaded to the node. + +### Mount for Persistent Volumes (mount) + +Another way of utilizing the ephemeral disks is to format and mount the individual disks. Mounting individual disks allows the [local-static-provisioner](https://github.com/kubernetes-sigs/sig-storage-local-static-provisioner) DaemonSet to create Persistent Volumes that pods can utilize. diff --git a/eks-worker-al2-variables.json b/eks-worker-al2-variables.json index 10b87092f..6faf232e6 100644 --- a/eks-worker-al2-variables.json +++ b/eks-worker-al2-variables.json @@ -11,19 +11,19 @@ "aws_session_token": "{{env `AWS_SESSION_TOKEN`}}", "binary_bucket_name": "amazon-eks", "binary_bucket_region": "us-west-2", - "cache_container_images": "false", + "cache_container_images": "false", "cni_plugin_version": "v0.8.6", - "containerd_version": "1.6.6-1.amzn2.0.2", + "containerd_version": "1.6.*", "creator": "{{env `USER`}}", - "docker_version": "20.10.17-1.amzn2.0.1", + "docker_version": "20.10.23-1.amzn2.0.1", "encrypted": "false", "kernel_version": "", "kms_key_id": "", "launch_block_device_mappings_volume_size": "4", - "pause_container_version": "3.5", + "pause_container_version": "3.5", "pull_cni_from_github": "true", - "remote_folder": "", - "runc_version": "1.1.4-1.amzn2", + "remote_folder": "/tmp", + "runc_version": "1.1.5-1.amzn2", "security_group_id": "", "source_ami_filter_name": "amzn2-ami-minimal-hvm-*", "source_ami_id": "", @@ -32,5 +32,6 @@ "ssh_username": "ec2-user", "subnet_id": "", "temporary_security_group_source_cidrs": "", - "volume_type": "gp2" + "volume_type": "gp2", + "working_dir": "{{user `remote_folder`}}/worker" } diff --git a/eks-worker-al2.json b/eks-worker-al2.json index aae32c09d..51d20fbf9 100644 --- a/eks-worker-al2.json +++ b/eks-worker-al2.json @@ -39,7 +39,8 @@ "ssh_username": null, "subnet_id": null, "temporary_security_group_source_cidrs": null, - "volume_type": null + "volume_type": null, + "working_dir": null }, "builders": [ { @@ -113,34 +114,34 @@ "provisioners": [ { "type": "shell", - "remote_folder": "{{ user `remote_folder`}}", - "script": "{{template_dir}}/scripts/install_additional_repos.sh", - "environment_vars": [ - "ADDITIONAL_YUM_REPOS={{user `additional_yum_repos`}}" + "inline": [ + "mkdir -p {{user `working_dir`}}", + "mkdir -p {{user 
`working_dir`}}/log-collector-script" ] }, { "type": "shell", "remote_folder": "{{ user `remote_folder`}}", - "inline": [ - "mkdir -p /tmp/worker/log-collector-script/" + "script": "{{template_dir}}/scripts/install_additional_repos.sh", + "environment_vars": [ + "ADDITIONAL_YUM_REPOS={{user `additional_yum_repos`}}" ] }, { "type": "file", "source": "{{template_dir}}/files/", - "destination": "/tmp/worker/" + "destination": "{{user `working_dir`}}" }, { "type": "file", "source": "{{template_dir}}/log-collector-script/linux/", - "destination": "/tmp/worker/log-collector-script/" + "destination": "{{user `working_dir`}}/log-collector-script/" }, { "type": "shell", "inline": [ - "sudo chmod -R a+x /tmp/worker/bin/", - "sudo mv /tmp/worker/bin/* /usr/bin/" + "sudo chmod -R a+x {{user `working_dir`}}/bin/", + "sudo mv {{user `working_dir`}}/bin/* /usr/bin/" ] }, { @@ -172,7 +173,8 @@ "AWS_SECRET_ACCESS_KEY={{user `aws_secret_access_key`}}", "AWS_SESSION_TOKEN={{user `aws_session_token`}}", "PAUSE_CONTAINER_VERSION={{user `pause_container_version`}}", - "CACHE_CONTAINER_IMAGES={{user `cache_container_images`}}" + "CACHE_CONTAINER_IMAGES={{user `cache_container_images`}}", + "WORKING_DIR={{user `working_dir`}}" ] }, { @@ -200,13 +202,22 @@ "type": "shell", "remote_folder": "{{ user `remote_folder`}}", "script": "{{template_dir}}/scripts/generate-version-info.sh", - "execute_command": "chmod +x {{ .Path }}; {{ .Path }} /tmp/version-info.json" + "execute_command": "chmod +x {{ .Path }}; {{ .Path }} {{user `working_dir`}}/version-info.json", + "environment_vars": [ + "CACHE_CONTAINER_IMAGES={{user `cache_container_images`}}" + ] }, { "type": "file", "direction": "download", - "source": "/tmp/version-info.json", + "source": "{{user `working_dir`}}/version-info.json", "destination": "{{ user `ami_name` }}-version-info.json" + }, + { + "type": "shell", + "inline": [ + "rm -rf {{user `working_dir`}}" + ] } ], "post-processors": [ diff --git a/files/bin/configure-clocksource b/files/bin/configure-clocksource new file mode 100755 index 000000000..9815401f8 --- /dev/null +++ b/files/bin/configure-clocksource @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +CLOCK_PATH="/sys/devices/system/clocksource/clocksource0" + +function log() { + echo >&2 "$@" +} + +function current-clocksource() { + cat "${CLOCK_PATH}/current_clocksource" +} + +function check-available-clocksource() { + grep --quiet "${1}" "${CLOCK_PATH}/available_clocksource" +} + +function try-set-clocksource() { + if check-available-clocksource "${1}"; then + echo "${1}" > "${CLOCK_PATH}/current_clocksource" + log "configured clocksource: ${1}" + else + log "clocksource not available: ${1}" + fi +} + +case "$(imds /latest/meta-data/system)" in + nitro) + CLOCKSOURCE="kvm-clock" + ;; + + **) + CLOCKSOURCE="tsc" + ;; +esac + +log "desired clocksource: ${CLOCKSOURCE}" + +if [ ! 
"$(current-clocksource)" = "${CLOCKSOURCE}" ]; then + try-set-clocksource "${CLOCKSOURCE}" +fi + +log "final clocksource: $(current-clocksource)" diff --git a/files/bin/imds b/files/bin/imds index 7619ee3fb..2d23801ba 100755 --- a/files/bin/imds +++ b/files/bin/imds @@ -50,7 +50,7 @@ function imdscurl() { function get-token() { local TOKEN_DIR=/tmp/imds-tokens - mkdir -p $TOKEN_DIR + mkdir -p -m a+wrx $TOKEN_DIR # cleanup expired tokens local DELETED_TOKENS=0 diff --git a/files/bin/setup-local-disks b/files/bin/setup-local-disks new file mode 100644 index 000000000..9cdb18dae --- /dev/null +++ b/files/bin/setup-local-disks @@ -0,0 +1,220 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +err_report() { + echo "Exited with error on line $1" +} +trap 'err_report $LINENO' ERR + +print_help() { + echo "usage: $0 " + echo "Sets up Amazon EC2 Instance Store NVMe disks" + echo "" + echo "-d, --dir directory to mount the filesystem(s) (default: /mnt/k8s-disks/)" + echo "-h, --help print this help" +} + +# Sets up a RAID-0 of NVMe instance storage disks, moves +# the contents of /var/lib/kubelet and /var/lib/containerd +# to the new mounted RAID, and bind mounts the kubelet and +# containerd state directories. +maybe_raid0() { + local md_name="kubernetes" + local md_device="/dev/md/${md_name}" + local md_config="/.aws/mdadm.conf" + local array_mount_point="${MNT_DIR}/0" + mkdir -p "$(dirname "${md_config}")" + + if [[ ! -s "${md_config}" ]]; then + mdadm --create --force --verbose \ + "${md_device}" \ + --level=0 \ + --name="${md_name}" \ + --raid-devices="${#EPHEMERAL_DISKS[@]}" \ + "${EPHEMERAL_DISKS[@]}" + while [ -n "$(mdadm --detail "${md_device}" | grep -ioE 'State :.*resyncing')" ]; do + echo "Raid is resyncing..." + sleep 1 + done + mdadm --detail --scan > "${md_config}" + fi + + ## Check if the device symlink has changed on reboot to include a homehost identifier + local current_md_device=$(find /dev/md/ -type l -regex ".*/${md_name}_?[0-9a-z]*$" | tail -n1) + if [[ ! -z ${current_md_device} ]]; then + md_device="${current_md_device}" + fi + + # Format the array if not already formatted. + if [[ -z "$(lsblk "${md_device}" -o fstype --noheadings)" ]]; then + ## By default, mkfs tries to use the stripe unit of the array (512k), + ## for the log stripe unit, but the max log stripe unit is 256k. + ## So instead, we use 32k (8 blocks) to avoid a warning of breaching the max. + ## mkfs.xfs defaults to 32k after logging the warning since the default log buffer size is 32k. 
+ mkfs.xfs -l su=8b "${md_device}" + fi + + ## Create the mount directory + mkdir -p "${array_mount_point}" + + local dev_uuid=$(blkid -s UUID -o value "${md_device}") + local mount_unit_name="$(systemd-escape --path --suffix=mount "${array_mount_point}")" + cat > "/etc/systemd/system/${mount_unit_name}" << EOF + [Unit] + Description=Mount EC2 Instance Store NVMe disk RAID0 + [Mount] + What=UUID=${dev_uuid} + Where=${array_mount_point} + Type=xfs + Options=defaults,noatime + [Install] + WantedBy=multi-user.target +EOF + systemd-analyze verify "${mount_unit_name}" + systemctl enable "${mount_unit_name}" --now + + prev_running="" + needs_linked="" + for unit in "kubelet" "containerd"; do + ## Check if the bind mount from the RAID already exists + if [[ "$(systemctl is-active var-lib-${unit}.mount)" != "active" ]]; then + # Check if components that depend on the RAID are running and, if so, stop them + if systemctl is-active "${unit}" > /dev/null 2>&1; then + prev_running+=" ${unit}" + fi + needs_linked+=" /var/lib/${unit}" + fi + done + + ## Check if /var/log/pods has been bind mounted and make sure kubelet is stopped + if [[ "$(systemctl is-active var-log-pods.mount)" != "active" ]]; then + if systemctl is-active "kubelet" > /dev/null 2>&1; then + prev_running+=" kubelet" + fi + needs_linked+=" /var/log/pods" + fi + + if [[ ! -z "${prev_running}" ]]; then + systemctl stop ${prev_running} + fi + + # Transfer state directories to the array, if they exist. + for mount_point in ${needs_linked}; do + local unit="$(basename "${mount_point}")" + local array_mount_point_unit="${array_mount_point}/${unit}" + mkdir -p "${mount_point}" + echo "Copying ${mount_point}/ to ${array_mount_point_unit}/" + cp -a "${mount_point}/" "${array_mount_point_unit}/" + local mount_unit_name="$(systemd-escape --path --suffix=mount "${mount_point}")" + cat > "/etc/systemd/system/${mount_unit_name}" << EOF + [Unit] + Description=Mount ${unit} on EC2 Instance Store NVMe RAID0 + [Mount] + What=${array_mount_point_unit} + Where=${mount_point} + Type=none + Options=bind + [Install] + WantedBy=multi-user.target +EOF + systemd-analyze verify "${mount_unit_name}" + systemctl enable "${mount_unit_name}" --now + done + + if [[ ! -z "${prev_running}" ]]; then + systemctl start ${prev_running} + fi +} + +# Mounts and creates xfs file systems on all EC2 instance store NVMe disks +# without existing file systems. Mounts in /mnt/k8s-disks/{1..} by default +maybe_mount() { + idx=1 + for dev in "${EPHEMERAL_DISKS[@]}"; do + if [[ -z "$(lsblk "${dev}" -o fstype --noheadings)" ]]; then + mkfs.xfs -l su=8b "${dev}" + fi + if [[ ! -z "$(lsblk "${dev}" -o MOUNTPOINT --noheadings)" ]]; then + echo "${dev} is already mounted." 
+ continue + fi + local mount_point="${MNT_DIR}/${idx}" + local mount_unit_name="$(systemd-escape --path --suffix=mount "${mount_point}")" + mkdir -p "${mount_point}" + cat > "/etc/systemd/system/${mount_unit_name}" << EOF + [Unit] + Description=Mount EC2 Instance Store NVMe disk ${idx} + [Mount] + What=${dev} + Where=${mount_point} + Type=xfs + Options=defaults,noatime + [Install] + WantedBy=multi-user.target +EOF + systemd-analyze verify "${mount_unit_name}" + systemctl enable "${mount_unit_name}" --now + idx=$((idx + 1)) + done +} + +## Main logic +MNT_DIR="/mnt/k8s-disks" + +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -h | --help) + print_help + exit 0 + ;; + -d | --dir) + MNT_DIR="$2" + shift + shift + ;; + *) # unknown option + POSITIONAL+=("$1") # save it in an array for later + shift # past argument + ;; + esac +done + +set +u +set -- "${POSITIONAL[@]}" # restore positional parameters +DISK_SETUP="$1" +set -u + +if [[ "${DISK_SETUP}" != "raid0" && "${DISK_SETUP}" != "mount" ]]; then + echo "Valid disk setup options are: raid0 or mount" + exit 1 +fi + +disks=($(find -L /dev/disk/by-id/ -xtype l -name '*NVMe_Instance_Storage_*')) +## Bail early if there are no ephemeral disks to setup +if [[ "${#disks[@]}" -eq 0 ]]; then + echo "no ephemeral disks found, skipping disk setup" + exit 0 +fi + +if [ "$(id --user)" -ne 0 ]; then + echo "Must be run as root" + exit 1 +fi + +## Get devices of NVMe instance storage ephemeral disks +EPHEMERAL_DISKS=($(realpath "${disks[@]}" | sort -u)) + +case "${DISK_SETUP}" in + "raid0") + maybe_raid0 + echo "Successfully setup RAID-0 consisting of ${EPHEMERAL_DISKS[@]}" + ;; + "mount") + maybe_mount + echo "Successfully setup disk mounts consisting of ${EPHEMERAL_DISKS[@]}" + ;; +esac diff --git a/files/bin/vercmp b/files/bin/vercmp index 8edf7b920..5bb467854 100755 --- a/files/bin/vercmp +++ b/files/bin/vercmp @@ -81,7 +81,10 @@ case $OPERATOR in ;; esac -echo "$OUTCOME" +VERCMP_QUIET="${VERCMP_QUIET:-false}" +if [ ! "$VERCMP_QUIET" = "true" ]; then + echo "$OUTCOME" +fi if [ "$OUTCOME" = "true" ]; then exit 0 diff --git a/files/bootstrap.sh b/files/bootstrap.sh index ce1a7f0e5..8937784bb 100755 --- a/files/bootstrap.sh +++ b/files/bootstrap.sh @@ -11,6 +11,9 @@ trap 'err_report $LINENO' ERR IFS=$'\n\t' +# mute stdout from vercmp +export VERCMP_QUIET=true + function print_help { echo "usage: $0 [options] <cluster-name>" echo "Bootstraps an instance into an EKS cluster" @@ -21,7 +24,7 @@ function print_help { echo "--aws-api-retry-attempts Number of retry attempts for AWS API call (DescribeCluster) (default: 3)" echo "--b64-cluster-ca The base64 encoded cluster CA content. Only valid when used with --apiserver-endpoint. Bypasses calling \"aws eks describe-cluster\"" echo "--cluster-id Specify the id of EKS cluster" - echo "--container-runtime Specify a container runtime (default: dockerd)" + echo "--container-runtime Specify a container runtime. For Kubernetes 1.23 and below, possible values are [dockerd, containerd] and the default value is dockerd. For Kubernetes 1.24 and above, containerd is the only valid value. This flag is deprecated and will be removed in a future release." echo "--containerd-config-file File containing the containerd configuration to be used in place of AMI defaults." echo "--dns-cluster-ip Overrides the IP address to use for DNS queries within the cluster. Defaults to 10.100.0.10 or 172.20.0.10 based on the IP address of the primary interface" echo "--docker-config-json The contents of the /etc/docker/daemon.json file. 
Useful if you want a custom config differing from the default one in the AMI" @@ -29,13 +32,20 @@ function print_help { echo "--enable-local-outpost Enable support for worker nodes to communicate with the local control plane when running on a disconnected Outpost. (true or false)" echo "--ip-family Specify ip family of the cluster" echo "--kubelet-extra-args Extra arguments to add to the kubelet. Useful for adding labels or taints." - echo "--mount-bfs-fs Mount a bpffs at /sys/fs/bpf (default: true, for Kubernetes 1.27+; false otherwise)" + echo "--local-disks Setup instance storage NVMe disks in raid0 or mount the individual disks for use by pods [mount | raid0]" + echo "--mount-bpf-fs Mount a bpffs at /sys/fs/bpf (default: true, for Kubernetes 1.25+; false otherwise)" echo "--pause-container-account The AWS account (number) to pull the pause container from" echo "--pause-container-version The tag of the pause container" echo "--service-ipv6-cidr ipv6 cidr range of the cluster" echo "--use-max-pods Sets --max-pods for the kubelet when true. (default: true)" } +function log { + echo >&2 "$(date '+%Y-%m-%dT%H:%M:%S%z')" "[eks-bootstrap]" "$@" +} + +log "INFO: starting..." + POSITIONAL=() while [[ $# -gt 0 ]]; do @@ -47,86 +57,109 @@ while [[ $# -gt 0 ]]; do ;; --use-max-pods) USE_MAX_PODS="$2" + log "INFO: --use-max-pods='${USE_MAX_PODS}'" shift shift ;; --b64-cluster-ca) B64_CLUSTER_CA=$2 + log "INFO: --b64-cluster-ca='${B64_CLUSTER_CA}'" shift shift ;; --apiserver-endpoint) APISERVER_ENDPOINT=$2 + log "INFO: --apiserver-endpoint='${APISERVER_ENDPOINT}'" shift shift ;; --kubelet-extra-args) KUBELET_EXTRA_ARGS=$2 + log "INFO: --kubelet-extra-args='${KUBELET_EXTRA_ARGS}'" shift shift ;; --enable-docker-bridge) ENABLE_DOCKER_BRIDGE=$2 + log "INFO: --enable-docker-bridge='${ENABLE_DOCKER_BRIDGE}'" shift shift ;; --aws-api-retry-attempts) API_RETRY_ATTEMPTS=$2 + log "INFO: --aws-api-retry-attempts='${API_RETRY_ATTEMPTS}'" shift shift ;; --docker-config-json) DOCKER_CONFIG_JSON=$2 + log "INFO: --docker-config-json='${DOCKER_CONFIG_JSON}'" shift shift ;; --containerd-config-file) CONTAINERD_CONFIG_FILE=$2 + log "INFO: --containerd-config-file='${CONTAINERD_CONFIG_FILE}'" shift shift ;; --pause-container-account) PAUSE_CONTAINER_ACCOUNT=$2 + log "INFO: --pause-container-account='${PAUSE_CONTAINER_ACCOUNT}'" shift shift ;; --pause-container-version) PAUSE_CONTAINER_VERSION=$2 + log "INFO: --pause-container-version='${PAUSE_CONTAINER_VERSION}'" shift shift ;; --dns-cluster-ip) DNS_CLUSTER_IP=$2 + log "INFO: --dns-cluster-ip='${DNS_CLUSTER_IP}'" shift shift ;; --container-runtime) CONTAINER_RUNTIME=$2 + log "INFO: --container-runtime='${CONTAINER_RUNTIME}'" shift shift ;; --ip-family) IP_FAMILY=$2 + log "INFO: --ip-family='${IP_FAMILY}'" shift shift ;; --service-ipv6-cidr) SERVICE_IPV6_CIDR=$2 + log "INFO: --service-ipv6-cidr='${SERVICE_IPV6_CIDR}'" shift shift ;; --enable-local-outpost) ENABLE_LOCAL_OUTPOST=$2 + log "INFO: --enable-local-outpost='${ENABLE_LOCAL_OUTPOST}'" shift shift ;; --cluster-id) CLUSTER_ID=$2 + log "INFO: --cluster-id='${CLUSTER_ID}'" shift shift ;; --mount-bpf-fs) MOUNT_BPF_FS=$2 + log "INFO: --mount-bpf-fs='${MOUNT_BPF_FS}'" + shift + shift + ;; + --local-disks) + LOCAL_DISKS=$2 + log "INFO: --local-disks='${LOCAL_DISKS}'" shift shift ;; @@ -143,31 +176,32 @@ CLUSTER_NAME="$1" set -u KUBELET_VERSION=$(kubelet --version | grep -Eo '[0-9]\.[0-9]+\.[0-9]+') -echo "Using kubelet version $KUBELET_VERSION" +log "INFO: Using kubelet version $KUBELET_VERSION" + +# 
ecr-credential-provider only implements credentialprovider.kubelet.k8s.io/v1alpha1 prior to 1.27.1: https://github.com/kubernetes/cloud-provider-aws/pull/597 +# TODO: remove this when 1.26 is EOL +if vercmp "$KUBELET_VERSION" lt "1.27.0"; then + IMAGE_CREDENTIAL_PROVIDER_CONFIG=/etc/eks/image-credential-provider/config.json + echo "$(jq '.apiVersion = "kubelet.config.k8s.io/v1alpha1"' $IMAGE_CREDENTIAL_PROVIDER_CONFIG)" > $IMAGE_CREDENTIAL_PROVIDER_CONFIG + echo "$(jq '.providers[].apiVersion = "credentialprovider.kubelet.k8s.io/v1alpha1"' $IMAGE_CREDENTIAL_PROVIDER_CONFIG)" > $IMAGE_CREDENTIAL_PROVIDER_CONFIG +fi + +# Set container runtime related variables +DOCKER_CONFIG_JSON="${DOCKER_CONFIG_JSON:-}" +ENABLE_DOCKER_BRIDGE="${ENABLE_DOCKER_BRIDGE:-false}" # As of Kubernetes version 1.24, we will start defaulting the container runtime to containerd # and no longer support docker as a container runtime. -IS_124_OR_GREATER=false DEFAULT_CONTAINER_RUNTIME=dockerd if vercmp "$KUBELET_VERSION" gteq "1.24.0"; then - IS_124_OR_GREATER=true DEFAULT_CONTAINER_RUNTIME=containerd -elif vercmp "$KUBELET_VERSION" gteq "1.22.0"; then - # These APIs are only available in alpha pre-1.24. - # This can be removed when version 1.23 is no longer supported. - sed -i s,kubelet.config.k8s.io/v1beta1,kubelet.config.k8s.io/v1alpha1,g /etc/eks/ecr-credential-provider/ecr-credential-provider-config - sed -i s,credentialprovider.kubelet.k8s.io/v1beta1,credentialprovider.kubelet.k8s.io/v1alpha1,g /etc/eks/ecr-credential-provider/ecr-credential-provider-config fi - -# Set container runtime related variables -DOCKER_CONFIG_JSON="${DOCKER_CONFIG_JSON:-}" -ENABLE_DOCKER_BRIDGE="${ENABLE_DOCKER_BRIDGE:-false}" CONTAINER_RUNTIME="${CONTAINER_RUNTIME:-$DEFAULT_CONTAINER_RUNTIME}" -echo "Using $CONTAINER_RUNTIME as the container runtime" +log "INFO: Using $CONTAINER_RUNTIME as the container runtime" -if $IS_124_OR_GREATER && [ $CONTAINER_RUNTIME != "containerd" ]; then - echo "ERROR: containerd is the only supported container runtime as of Kubernetes version 1.24" +if vercmp "$KUBELET_VERSION" gteq "1.24.0" && [ $CONTAINER_RUNTIME != "containerd" ]; then + log "ERROR: containerd is the only supported container runtime as of Kubernetes version 1.24" exit 1 fi @@ -184,9 +218,14 @@ IP_FAMILY="${IP_FAMILY:-}" SERVICE_IPV6_CIDR="${SERVICE_IPV6_CIDR:-}" ENABLE_LOCAL_OUTPOST="${ENABLE_LOCAL_OUTPOST:-}" CLUSTER_ID="${CLUSTER_ID:-}" +LOCAL_DISKS="${LOCAL_DISKS:-}" + +if [[ ! -z ${LOCAL_DISKS} ]]; then + setup-local-disks "${LOCAL_DISKS}" +fi DEFAULT_MOUNT_BPF_FS="true" -if vercmp "$KUBELET_VERSION" lt "1.27.0"; then +if vercmp "$KUBELET_VERSION" lt "1.25.0"; then DEFAULT_MOUNT_BPF_FS="false" fi MOUNT_BPF_FS="${MOUNT_BPF_FS:-$DEFAULT_MOUNT_BPF_FS}" @@ -253,21 +292,21 @@ get_cpu_millicores_to_reserve() { } if [ -z "$CLUSTER_NAME" ]; then - echo "CLUSTER_NAME is not defined" + log "ERROR: cluster name is not defined!" exit 1 fi if [[ ! -z "${IP_FAMILY}" ]]; then IP_FAMILY="$(tr [A-Z] [a-z] <<< "$IP_FAMILY")" if [[ "${IP_FAMILY}" != "ipv4" ]] && [[ "${IP_FAMILY}" != "ipv6" ]]; then - echo "Invalid IpFamily. Only ipv4 or ipv6 are allowed" + log "ERROR: Invalid --ip-family. Only ipv4 or ipv6 are allowed" exit 1 fi fi if [[ ! 
-z "${SERVICE_IPV6_CIDR}" ]]; then if [[ "${IP_FAMILY}" == "ipv4" ]]; then - echo "ip-family should be ipv6 when service-ipv6-cidr is specified" + log "ERROR: --ip-family should be ipv6 when --service-ipv6-cidr is specified" exit 1 fi IP_FAMILY="ipv6" @@ -278,14 +317,19 @@ AWS_SERVICES_DOMAIN=$(imds 'latest/meta-data/services/domain') MACHINE=$(uname -m) if [[ "$MACHINE" != "x86_64" && "$MACHINE" != "aarch64" ]]; then - echo "Unknown machine architecture '$MACHINE'" >&2 + log "ERROR: Unknown machine architecture: '$MACHINE'" exit 1 fi if [ "$MOUNT_BPF_FS" = "true" ]; then - sudo mount-bpf-fs + mount-bpf-fs fi +cp -v /etc/eks/configure-clocksource.service /etc/systemd/system/configure-clocksource.service +chown root:root /etc/systemd/system/configure-clocksource.service +systemctl daemon-reload +systemctl enable --now configure-clocksource + ECR_URI=$(/etc/eks/get-ecr-uri.sh "${AWS_DEFAULT_REGION}" "${AWS_SERVICES_DOMAIN}" "${PAUSE_CONTAINER_ACCOUNT:-}") PAUSE_CONTAINER_IMAGE=${PAUSE_CONTAINER_IMAGE:-$ECR_URI/eks/pause} PAUSE_CONTAINER="$PAUSE_CONTAINER_IMAGE:$PAUSE_CONTAINER_VERSION" @@ -296,13 +340,14 @@ CA_CERTIFICATE_DIRECTORY=/etc/kubernetes/pki CA_CERTIFICATE_FILE_PATH=$CA_CERTIFICATE_DIRECTORY/ca.crt mkdir -p $CA_CERTIFICATE_DIRECTORY if [[ -z "${B64_CLUSTER_CA}" ]] || [[ -z "${APISERVER_ENDPOINT}" ]]; then + log "INFO: --cluster-ca or --api-server-endpoint is not defined, describing cluster..." DESCRIBE_CLUSTER_RESULT="/tmp/describe_cluster_result.txt" # Retry the DescribeCluster API for API_RETRY_ATTEMPTS for attempt in $(seq 0 $API_RETRY_ATTEMPTS); do rc=0 if [[ $attempt -gt 0 ]]; then - echo "Attempt $attempt of $API_RETRY_ATTEMPTS" + log "INFO: Attempt $attempt of $API_RETRY_ATTEMPTS" fi aws eks wait cluster-active \ @@ -318,6 +363,7 @@ if [[ -z "${B64_CLUSTER_CA}" ]] || [[ -z "${APISERVER_ENDPOINT}" ]]; then break fi if [[ $attempt -eq $API_RETRY_ATTEMPTS ]]; then + log "ERROR: Exhausted retries while describing cluster!" exit $rc fi jitter=$((1 + RANDOM % 10)) @@ -354,6 +400,8 @@ if [[ -z "${IP_FAMILY}" ]] || [[ "${IP_FAMILY}" == "None" ]]; then IP_FAMILY="ipv4" fi +log "INFO: Using IP family: ${IP_FAMILY}" + echo $B64_CLUSTER_CA | base64 -d > $CA_CERTIFICATE_FILE_PATH sed -i s,MASTER_ENDPOINT,$APISERVER_ENDPOINT,g /var/lib/kubelet/kubeconfig @@ -384,7 +432,7 @@ if [[ "${ENABLE_LOCAL_OUTPOST}" == "true" ]]; then ### - if "aws eks describe-cluster" is bypassed, for local outpost, the value of CLUSTER_NAME parameter will be cluster id. ### - otherwise, the cluster id will use the id returned by "aws eks describe-cluster". 
if [[ -z "${CLUSTER_ID}" ]]; then - echo "Cluster ID is required when local outpost support is enabled" + log "ERROR: Cluster ID is required when local outpost support is enabled" exit 1 else sed -i s,CLUSTER_NAME,$CLUSTER_ID,g /var/lib/kubelet/kubeconfig @@ -399,12 +447,12 @@ fi ### kubelet.service configuration -MAC=$(imds 'latest/meta-data/network/interfaces/macs/' | head -n 1 | sed 's/\/$//') +MAC=$(imds 'latest/meta-data/mac') if [[ -z "${DNS_CLUSTER_IP}" ]]; then if [[ "${IP_FAMILY}" == "ipv6" ]]; then if [[ -z "${SERVICE_IPV6_CIDR}" ]]; then - echo "One of --service-ipv6-cidr or --dns-cluster-ip must be provided when ip-family is specified as ipv6" + log "ERROR: One of --service-ipv6-cidr or --dns-cluster-ip must be provided when --ip-family is ipv6" exit 1 fi DNS_CLUSTER_IP=$(awk -F/ '{print $1}' <<< $SERVICE_IPV6_CIDR)a @@ -454,7 +502,7 @@ set +o pipefail MAX_PODS=$(cat $MAX_PODS_FILE | awk "/^${INSTANCE_TYPE:-unset}/"' { print $2 }') set -o pipefail if [ -z "$MAX_PODS" ] || [ -z "$INSTANCE_TYPE" ]; then - echo "No entry for type '$INSTANCE_TYPE' in $MAX_PODS_FILE. Will attempt to auto-discover value." + log "INFO: No entry for type '$INSTANCE_TYPE' in $MAX_PODS_FILE. Will attempt to auto-discover value." # When determining the value of maxPods, we're using the legacy calculation by default since it's more restrictive than # the PrefixDelegation based alternative and is likely to be in-use by more customers. # The legacy numbers also maintain backwards compatibility when used to calculate `kubeReserved.memory` @@ -481,31 +529,29 @@ if vercmp "$KUBELET_VERSION" lt "1.26.0"; then else KUBELET_CLOUD_PROVIDER="external" echo "$(jq ".providerID=\"$(provider-id)\"" $KUBELET_CONFIG)" > $KUBELET_CONFIG + # When the external cloud provider is used, kubelet will use /etc/hostname as the name of the Node object. + # If the VPC has a custom `domain-name` in its DHCP options set, and the VPC has `enableDnsHostnames` set to `true`, + # then /etc/hostname is not the same as EC2's PrivateDnsName. + # The name of the Node object must be equal to EC2's PrivateDnsName for the aws-iam-authenticator to allow this kubelet to manage it. + INSTANCE_ID=$(imds /latest/meta-data/instance-id) + # the AWS CLI currently constructs the wrong endpoint URL on localzones (the availability zone group will be used instead of the parent region) + # more info: https://github.com/aws/aws-cli/issues/7043 + REGION=$(imds /latest/meta-data/placement/region) + PRIVATE_DNS_NAME=$(AWS_RETRY_MODE=standard AWS_MAX_ATTEMPTS=10 aws ec2 describe-instances --region $REGION --instance-ids $INSTANCE_ID --query 'Reservations[].Instances[].PrivateDnsName' --output text) + KUBELET_ARGS="$KUBELET_ARGS --hostname-override=$PRIVATE_DNS_NAME" fi KUBELET_ARGS="$KUBELET_ARGS --cloud-provider=$KUBELET_CLOUD_PROVIDER" -mkdir -p /etc/systemd/system/kubelet.service.d - -cat << EOF > /etc/systemd/system/kubelet.service.d/10-kubelet-args.conf -[Service] -Environment='KUBELET_ARGS=$KUBELET_ARGS' -EOF - -if [[ -n "$KUBELET_EXTRA_ARGS" ]]; then - cat << EOF > /etc/systemd/system/kubelet.service.d/30-kubelet-extra-args.conf -[Service] -Environment='KUBELET_EXTRA_ARGS=$KUBELET_EXTRA_ARGS' -EOF -fi +mkdir -p /etc/systemd/system if [[ "$CONTAINER_RUNTIME" = "containerd" ]]; then if $ENABLE_DOCKER_BRIDGE; then - echo "WARNING: Flag --enable-docker-bridge was set but will be ignored as it's not relevant to containerd" + log "WARNING: Flag --enable-docker-bridge was set but will be ignored as it's not relevant to containerd" fi if [ ! 
-z "$DOCKER_CONFIG_JSON" ]; then - echo "WARNING: Flag --docker-config-json was set but will be ignored as it's not relevant to containerd" + log "WARNING: Flag --docker-config-json was set but will be ignored as it's not relevant to containerd" fi sudo mkdir -p /etc/containerd @@ -538,6 +584,12 @@ if [[ "$CONTAINER_RUNTIME" = "containerd" ]]; then sudo chown root:root /etc/systemd/system/kubelet.service # Validate containerd config sudo containerd config dump > /dev/null + + # --container-runtime flag is gone in 1.27+ + # TODO: remove this when 1.26 is EOL + if vercmp "$KUBELET_VERSION" lt "1.27.0"; then + KUBELET_ARGS="$KUBELET_ARGS --container-runtime=remote" + fi elif [[ "$CONTAINER_RUNTIME" = "dockerd" ]]; then mkdir -p /etc/docker bash -c "/sbin/iptables-save > /etc/sysconfig/iptables" @@ -558,16 +610,31 @@ elif [[ "$CONTAINER_RUNTIME" = "dockerd" ]]; then systemctl enable docker systemctl restart docker else - echo "Container runtime ${CONTAINER_RUNTIME} is not supported." + log "ERROR: unsupported container runtime: '${CONTAINER_RUNTIME}'" exit 1 fi +mkdir -p /etc/systemd/system/kubelet.service.d + +cat << EOF > /etc/systemd/system/kubelet.service.d/10-kubelet-args.conf +[Service] +Environment='KUBELET_ARGS=$KUBELET_ARGS' +EOF + +if [[ -n "$KUBELET_EXTRA_ARGS" ]]; then + cat << EOF > /etc/systemd/system/kubelet.service.d/30-kubelet-extra-args.conf +[Service] +Environment='KUBELET_EXTRA_ARGS=$KUBELET_EXTRA_ARGS' +EOF +fi + +systemctl daemon-reload systemctl enable kubelet systemctl start kubelet # gpu boost clock if command -v nvidia-smi &> /dev/null; then - echo "nvidia-smi found" + log "INFO: nvidia-smi found" nvidia-smi -q > /tmp/nvidia-smi-check if [[ "$?" == "0" ]]; then @@ -575,7 +642,7 @@ if command -v nvidia-smi &> /dev/null; then sudo nvidia-smi --auto-boost-default=0 GPUNAME=$(nvidia-smi -L | head -n1) - echo $GPUNAME + log "INFO: GPU name: $GPUNAME" # set application clock to maximum if [[ $GPUNAME == *"A100"* ]]; then @@ -592,8 +659,9 @@ if command -v nvidia-smi &> /dev/null; then echo "unsupported gpu" fi else + log "ERROR: nvidia-smi check failed!" cat /tmp/nvidia-smi-check fi -else - echo "nvidia-smi not found" fi + +log "INFO: complete!" 
diff --git a/files/configure-clocksource.service b/files/configure-clocksource.service new file mode 100644 index 000000000..5274ca041 --- /dev/null +++ b/files/configure-clocksource.service @@ -0,0 +1,8 @@ +[Unit] +Description=Configure kernel clocksource + +[Service] +ExecStart=/usr/bin/configure-clocksource + +[Install] +WantedBy=multi-user.target diff --git a/files/ecr-credential-provider-config b/files/ecr-credential-provider-config deleted file mode 100644 index ae1f9d7a5..000000000 --- a/files/ecr-credential-provider-config +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: kubelet.config.k8s.io/v1beta1 -kind: CredentialProviderConfig -providers: - - name: ecr-credential-provider - matchImages: - - "*.dkr.ecr.*.amazonaws.com" - - "*.dkr.ecr.*.amazonaws.cn" - - "*.dkr.ecr-fips.*.amazonaws.com" - - "*.dkr.ecr.us-iso-east-1.c2s.ic.gov" - - "*.dkr.ecr.us-isob-east-1.sc2s.sgov.gov" - defaultCacheDuration: "12h" - apiVersion: credentialprovider.kubelet.k8s.io/v1beta1 - args: - - get-credentials diff --git a/files/ecr-credential-provider-config.json b/files/ecr-credential-provider-config.json new file mode 100644 index 000000000..6b251d69c --- /dev/null +++ b/files/ecr-credential-provider-config.json @@ -0,0 +1,18 @@ +{ + "apiVersion": "kubelet.config.k8s.io/v1", + "kind": "CredentialProviderConfig", + "providers": [ + { + "name": "ecr-credential-provider", + "matchImages": [ + "*.dkr.ecr.*.amazonaws.com", + "*.dkr.ecr.*.amazonaws.com.cn", + "*.dkr.ecr-fips.*.amazonaws.com", + "*.dkr.ecr.*.c2s.ic.gov", + "*.dkr.ecr.*.sc2s.sgov.gov" + ], + "defaultCacheDuration": "12h", + "apiVersion": "credentialprovider.kubelet.k8s.io/v1" + } + ] +} diff --git a/files/eni-max-pods.txt b/files/eni-max-pods.txt index cde3c610e..f82b87d9f 100644 --- a/files/eni-max-pods.txt +++ b/files/eni-max-pods.txt @@ -176,6 +176,14 @@ c7g.large 29 c7g.medium 8 c7g.metal 737 c7g.xlarge 58 +c7gn.12xlarge 234 +c7gn.16xlarge 737 +c7gn.2xlarge 58 +c7gn.4xlarge 234 +c7gn.8xlarge 234 +c7gn.large 29 +c7gn.medium 8 +c7gn.xlarge 58 cr1.8xlarge 234 d2.2xlarge 58 d2.4xlarge 234 @@ -233,6 +241,9 @@ h1.4xlarge 234 h1.8xlarge 234 hpc6a.48xlarge 100 hpc6id.32xlarge 51 +hpc7g.16xlarge 198 +hpc7g.4xlarge 198 +hpc7g.8xlarge 198 hs1.8xlarge 234 i2.2xlarge 58 i2.4xlarge 234 @@ -253,6 +264,12 @@ i3en.6xlarge 234 i3en.large 29 i3en.metal 737 i3en.xlarge 58 +i4g.16xlarge 737 +i4g.2xlarge 58 +i4g.4xlarge 234 +i4g.8xlarge 234 +i4g.large 29 +i4g.xlarge 58 i4i.16xlarge 737 i4i.2xlarge 58 i4i.32xlarge 737 @@ -271,6 +288,10 @@ inf1.24xlarge 321 inf1.2xlarge 38 inf1.6xlarge 234 inf1.xlarge 38 +inf2.24xlarge 737 +inf2.48xlarge 737 +inf2.8xlarge 234 +inf2.xlarge 58 is4gen.2xlarge 58 is4gen.4xlarge 234 is4gen.8xlarge 234 @@ -623,6 +644,7 @@ t4g.small 11 t4g.xlarge 58 trn1.2xlarge 58 trn1.32xlarge 247 +trn1n.32xlarge 247 u-12tb1.112xlarge 737 u-12tb1.metal 147 u-18tb1.112xlarge 737 diff --git a/files/get-ecr-uri.sh b/files/get-ecr-uri.sh index 134dc39cc..ba719ac06 100755 --- a/files/get-ecr-uri.sh +++ b/files/get-ecr-uri.sh @@ -27,6 +27,9 @@ else us-gov-east-1) acct="151742754352" ;; + us-iso-west-1) + acct="608367168043" + ;; us-iso-east-1) acct="725322719131" ;; diff --git a/files/kubelet-containerd.service b/files/kubelet-containerd.service index db1c56511..946fb1c28 100644 --- a/files/kubelet-containerd.service +++ b/files/kubelet-containerd.service @@ -10,10 +10,9 @@ ExecStartPre=/sbin/iptables -P FORWARD ACCEPT -w 5 ExecStart=/usr/bin/kubelet \ --config /etc/kubernetes/kubelet/kubelet-config.json \ --kubeconfig /var/lib/kubelet/kubeconfig \ - 
--container-runtime remote \
     --container-runtime-endpoint unix:///run/containerd/containerd.sock \
-    --image-credential-provider-config /etc/eks/ecr-credential-provider/ecr-credential-provider-config \
-    --image-credential-provider-bin-dir /etc/eks/ecr-credential-provider \
+    --image-credential-provider-config /etc/eks/image-credential-provider/config.json \
+    --image-credential-provider-bin-dir /etc/eks/image-credential-provider \
     $KUBELET_ARGS \
     $KUBELET_EXTRA_ARGS
diff --git a/files/kubelet.service b/files/kubelet.service
index 5002876be..08c746504 100644
--- a/files/kubelet.service
+++ b/files/kubelet.service
@@ -11,8 +11,8 @@ ExecStart=/usr/bin/kubelet \
     --kubeconfig /var/lib/kubelet/kubeconfig \
     --container-runtime docker \
     --network-plugin cni \
-    --image-credential-provider-config /etc/eks/ecr-credential-provider/ecr-credential-provider-config \
-    --image-credential-provider-bin-dir /etc/eks/ecr-credential-provider \
+    --image-credential-provider-config /etc/eks/image-credential-provider/config.json \
+    --image-credential-provider-bin-dir /etc/eks/image-credential-provider \
     $KUBELET_ARGS \
     $KUBELET_EXTRA_ARGS
diff --git a/hack/generate-template-variable-doc.py b/hack/generate-template-variable-doc.py
new file mode 100755
index 000000000..35cdde476
--- /dev/null
+++ b/hack/generate-template-variable-doc.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+
+import json
+import os
+import re
+
+whereami = os.path.abspath(__file__)
+os.chdir(os.path.dirname(whereami))
+
+template = {}
+with open('../eks-worker-al2.json') as template_file:
+    template = json.load(template_file)
+
+default_vars = {}
+with open('../eks-worker-al2-variables.json') as default_var_file:
+    default_vars = json.load(default_var_file)
+
+all_vars = {}
+
+for var in template['variables']:
+    all_vars[var] = None
+for var, default_val in default_vars.items():
+    all_vars[var] = default_val
+
+doc_file_name = '../doc/USER_GUIDE.md'
+doc = None
+with open(doc_file_name) as doc_file:
+    doc = doc_file.read()
+
+table_boundary = '<!-- template-variable-table-boundary -->'  # marker text assumed; the HTML comment was stripped from this copy of the patch
+existing_table_pattern = f"{table_boundary}([\S\s]*){table_boundary}"
+existing_table_matches = re.search(existing_table_pattern, doc)
+existing_table_lines = existing_table_matches.group(1).splitlines()
+
+new_table = f"{table_boundary}\n"
+new_table += f"{existing_table_lines[1]}\n"
+new_table += f"{existing_table_lines[2]}\n"
+
+existing_descriptions = {}
+for line in existing_table_lines[3:]:
+    columns = line.split('|')
+    var = columns[1].strip(" `")
+    existing_descriptions[var] = columns[3].strip(" `")
+
+for var, val in all_vars.items():
+    if val is not None:
+        if val == "":
+            val = f"`\"\"`"
+        else:
+            val = f"```{val}```"
+    description = ""
+    if var in existing_descriptions:
+        description = existing_descriptions[var]
+    new_table += f"| `{var}` | {val} | {description} |\n"
+
+new_table += table_boundary
+
+replace_doc_pattern = f"{table_boundary}[\S\s]*{table_boundary}"
+new_doc = re.sub(replace_doc_pattern, new_table, doc)
+
+with open(doc_file_name, 'w') as doc_file:
+    doc_file.write(new_doc)
diff --git a/hack/transform-al2-to-al2023.sh b/hack/transform-al2-to-al2023.sh
new file mode 100755
index 000000000..d7ebd29b3
--- /dev/null
+++ b/hack/transform-al2-to-al2023.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+set -o pipefail
+set -o nounset
+set -o errexit
+
+if [[ -z "${PACKER_TEMPLATE_FILE:-}" ]]; then
+  echo "PACKER_TEMPLATE_FILE must be set." >&2
+  exit 1
+fi
+if [[ -z "${PACKER_DEFAULT_VARIABLE_FILE:-}" ]]; then
+  echo "PACKER_DEFAULT_VARIABLE_FILE must be set." >&2
+  exit 1
+fi
+
+# rsa keys are not supported in al2023, switch to ed25519
+# delete the upgrade kernel provisioner as we don't need it for al2023
+cat "${PACKER_TEMPLATE_FILE}" \
+  | jq '._comment = "All template variables are enumerated here; and most variables have a default value defined in eks-worker-al2023-variables.json"' \
+  | jq '.variables.temporary_key_pair_type = "ed25519"' \
+  | jq '.provisioners |= map(select(.script//empty|endswith("upgrade_kernel.sh")|not))' \
+  > "${PACKER_TEMPLATE_FILE/al2/al2023}"
+
+# use newer versions of containerd and runc, do not install docker
+# use al2023 6.1 minimal image
+cat "${PACKER_DEFAULT_VARIABLE_FILE}" \
+  | jq '.ami_component_description = "(k8s: {{ user `kubernetes_version` }}, containerd: {{ user `containerd_version` }})"' \
+  | jq '.ami_description = "EKS-optimized Kubernetes node based on Amazon Linux 2023"' \
+  | jq '.containerd_version = "*" | .runc_version = "*" | .docker_version = ""' \
+  | jq '.source_ami_filter_name = "al2023-ami-minimal-2023.*-kernel-6.1-x86_64"' \
+  | jq '.volume_type = "gp3"' \
+  > "${PACKER_DEFAULT_VARIABLE_FILE/al2/al2023}"
diff --git a/log-collector-script/linux/README.md b/log-collector-script/linux/README.md
index 69bc088b3..4119e4410 100644
--- a/log-collector-script/linux/README.md
+++ b/log-collector-script/linux/README.md
@@ -91,7 +91,7 @@ Trying to archive gathered information...
 
 * SSM agent should be installed and running on Worker Node(s). [How to Install SSM Agent link](https://docs.aws.amazon.com/systems-manager/latest/userguide/sysman-manual-agent-install.html)
 
-* Worker Node(s) should have required permissions to communicate with SSM service. IAM managed role `AmazonEC2RoleforSSM` will have all the required permission for SSM agent to run on EC2 instances. The IAM managed role `AmazonEC2RoleforSSM` has `S3:PutObject` permission to all S3 resources.
+* Worker Node(s) should have the required permissions to communicate with the SSM service. The IAM managed policy `AmazonSSMManagedInstanceCore` grants all the permissions the SSM agent needs to run on EC2 instances. Note that this policy does not grant any S3 access; to upload the collected log archive to S3, grant `S3:PutObject` on the destination bucket separately.
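The scoped-down policy referenced above can be attached to the node's instance role with the AWS CLI; the role and bucket names in this sketch are placeholders, not values from this patch:

```bash
# "my-eks-node-role" and "my-log-bucket" are placeholders; substitute your own names.
aws iam attach-role-policy \
  --role-name my-eks-node-role \
  --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore

# The core policy carries no S3 permissions, so the log collector's S3 upload
# needs a separate policy scoped to the destination bucket:
aws iam put-role-policy \
  --role-name my-eks-node-role \
  --policy-name eks-log-collector-upload \
  --policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":"s3:PutObject","Resource":"arn:aws:s3:::my-log-bucket/*"}]}'
```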
 *Note:* For more granular control of the IAM permission check [Actions defined by AWS Systems Manager](https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssystemsmanager.html#awssystemsmanager-actions-as-permissions)
diff --git a/log-collector-script/linux/eks-log-collector.sh b/log-collector-script/linux/eks-log-collector.sh
index 351c3f103..c8e2048ae 100644
--- a/log-collector-script/linux/eks-log-collector.sh
+++ b/log-collector-script/linux/eks-log-collector.sh
@@ -20,7 +20,7 @@ export LANG="C"
 export LC_ALL="C"
 
 # Global options
-readonly PROGRAM_VERSION="0.7.4"
+readonly PROGRAM_VERSION="0.7.6"
 readonly PROGRAM_SOURCE="https://github.com/awslabs/amazon-eks-ami/blob/master/log-collector-script/"
 readonly PROGRAM_NAME="$(basename "$0" .sh)"
 readonly PROGRAM_DIR="/opt/log-collector"
@@ -50,6 +50,7 @@ REQUIRED_UTILS=(
 
 COMMON_DIRECTORIES=(
   kernel
+  modinfo
   system
   docker
   containerd
@@ -71,6 +72,7 @@ COMMON_LOGS=(
   pods # eks
   cloud-init.log
   cloud-init-output.log
+  user-data.log
   kube-proxy.log
 )
 
@@ -262,6 +264,7 @@ collect() {
   get_region
   get_common_logs
   get_kernel_info
+  get_modinfo
   get_mounts_info
   get_selinux_info
   get_iptables_info
@@ -353,6 +356,7 @@ get_common_logs() {
       cp --force --dereference --recursive /var/log/containers/ebs-csi* "${COLLECT_DIR}"/var_log/ 2> /dev/null
       cp --force --dereference --recursive /var/log/containers/efs-csi* "${COLLECT_DIR}"/var_log/ 2> /dev/null
       cp --force --dereference --recursive /var/log/containers/fsx-csi* "${COLLECT_DIR}"/var_log/ 2> /dev/null
+      cp --force --dereference --recursive /var/log/containers/fsx-openzfs-csi* "${COLLECT_DIR}"/var_log/ 2> /dev/null
       cp --force --dereference --recursive /var/log/containers/file-cache-csi* "${COLLECT_DIR}"/var_log/ 2> /dev/null
       continue
     fi
@@ -363,6 +367,9 @@ get_common_logs() {
       cp --force --dereference --recursive /var/log/pods/kube-system_kube-proxy* "${COLLECT_DIR}"/var_log/ 2> /dev/null
       cp --force --dereference --recursive /var/log/pods/kube-system_ebs-csi-* "${COLLECT_DIR}"/var_log/ 2> /dev/null
       cp --force --dereference --recursive /var/log/pods/kube-system_efs-csi-* "${COLLECT_DIR}"/var_log/ 2> /dev/null
+      cp --force --dereference --recursive /var/log/pods/kube-system_fsx-csi-* "${COLLECT_DIR}"/var_log/ 2> /dev/null
+      cp --force --dereference --recursive /var/log/pods/kube-system_fsx-openzfs-csi-* "${COLLECT_DIR}"/var_log/ 2> /dev/null
+      cp --force --dereference --recursive /var/log/pods/kube-system_file-cache-csi-* "${COLLECT_DIR}"/var_log/ 2> /dev/null
       continue
     fi
     cp --force --recursive --dereference /var/log/"${entry}" "${COLLECT_DIR}"/var_log/ 2> /dev/null
@@ -385,6 +392,12 @@ get_kernel_info() {
   ok
 }
 
+# collect modinfo on specific modules for debugging purposes
+get_modinfo() {
+  try "collect modinfo"
+  modinfo lustre > "${COLLECT_DIR}/modinfo/lustre"
+}
+
 get_docker_logs() {
   try "collect Docker daemon logs"
 
@@ -526,6 +539,14 @@ get_networking_info() {
   fi
 
   cp /etc/resolv.conf "${COLLECT_DIR}"/networking/resolv.conf
+
+  # collect ethtool -S for all interfaces
+  INTERFACES=$(ip -o a | awk '{print $2}' | sort -n | uniq)
+  for ifc in ${INTERFACES}; do
+    echo "Interface ${ifc}" >> "${COLLECT_DIR}"/networking/ethtool.txt
+    ethtool -S ${ifc} >> "${COLLECT_DIR}"/networking/ethtool.txt 2>&1
+    echo -e "\n" >> "${COLLECT_DIR}"/networking/ethtool.txt
+  done
 
   ok
 }
diff --git a/log-collector-script/windows/README.md b/log-collector-script/windows/README.md
index 374a4053b..945211c14 100644
--- a/log-collector-script/windows/README.md
+++ b/log-collector-script/windows/README.md
@@ -84,7 +84,7 @@ Done... your bundled logs are located in C:\log-collector\eks_i-0b318f704c74b6a
 
 * SSM agent should be installed and running on Worker Node(s). [How to Install SSM Agent link](https://docs.aws.amazon.com/systems-manager/latest/userguide/sysman-manual-agent-install.html)
 
-* Worker Node(s) should have required permissions to communicate with SSM service. IAM managed role `AmazonEC2RoleforSSM` will have all the required permission for SSM agent to run on EC2 instances. The IAM managed role `AmazonEC2RoleforSSM` has `S3:PutObject` permission to all S3 resources.
+* Worker Node(s) should have the required permissions to communicate with the SSM service. The IAM managed policy `AmazonSSMManagedInstanceCore` grants all the permissions the SSM agent needs to run on EC2 instances. Note that this policy does not grant any S3 access; to upload the collected log archive to S3, grant `S3:PutObject` on the destination bucket separately.
 
 *Note:* For more granular control of the IAM permission check [Actions defined by AWS Systems Manager](https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssystemsmanager.html#awssystemsmanager-actions-as-permissions)
 
diff --git a/log-collector-script/windows/eks-log-collector.ps1 b/log-collector-script/windows/eks-log-collector.ps1
index f96916e7e..31fa84ba2 100644
--- a/log-collector-script/windows/eks-log-collector.ps1
+++ b/log-collector-script/windows/eks-log-collector.ps1
@@ -289,7 +289,7 @@ Function get_k8s_info{
     Write-Host "Collecting kubelet information"
     copy C:\ProgramData\kubernetes\kubeconfig $info_system\kubelet\
     copy C:\ProgramData\kubernetes\kubelet-config.json $info_system\kubelet\
-    copy C:\ProgramData\Amazon\EKS\cni\config\vpc-shared-eni.conf $info_system\cni\
+    copy C:\ProgramData\Amazon\EKS\cni\config\* $info_system\cni\
     Write-Host "OK" -foregroundcolor "green"
   }
   catch {
diff --git a/scripts/cleanup.sh b/scripts/cleanup.sh
index 24861c3e9..f99893412 100644
--- a/scripts/cleanup.sh
+++ b/scripts/cleanup.sh
@@ -4,9 +4,6 @@
 sudo yum clean all
 sudo rm -rf /var/cache/yum
 
-# Clean up build artifacts
-sudo rm -rf /tmp/worker
-
 # Clean up files to reduce confusion during debug
 sudo rm -rf \
   /etc/hostname \
diff --git a/scripts/generate-version-info.sh b/scripts/generate-version-info.sh
index 9a52f42ce..3f75cc01d 100644
--- a/scripts/generate-version-info.sh
+++ b/scripts/generate-version-info.sh
@@ -20,4 +20,9 @@ echo $(jq ".binaries.kubelet = \"$(kubelet --version | awk '{print $2}')\"" $OUTPUT_FILE) > $OUTPUT_FILE
 echo $(jq ".binaries.awscli = \"$(aws --version | awk '{print $1}' | cut -d '/' -f 2)\"" $OUTPUT_FILE) > $OUTPUT_FILE
 
 # cached images
-echo $(jq ".images = [ $(sudo ctr -n k8s.io image ls -q | cut -d'/' -f2- | sort | uniq | grep -v 'sha256' | xargs -r printf "\"%s\"," | sed 's/,$//') ]" $OUTPUT_FILE) > $OUTPUT_FILE
+if systemctl is-active --quiet containerd; then
+  echo $(jq ".images = [ $(sudo ctr -n k8s.io image ls -q | cut -d'/' -f2- | sort | uniq | grep -v 'sha256' | xargs -r printf "\"%s\"," | sed 's/,$//') ]" $OUTPUT_FILE) > $OUTPUT_FILE
+elif [ "${CACHE_CONTAINER_IMAGES}" = "true" ]; then
+  echo "containerd must be active to generate version info for cached images" >&2
+  exit 1
+fi
diff --git a/scripts/install-worker.sh b/scripts/install-worker.sh
index 22333ce39..398858df9 100644
--- a/scripts/install-worker.sh
+++ b/scripts/install-worker.sh
@@ -6,8 +6,6 @@ set -o errexit
 IFS=$'\n\t'
 export AWS_DEFAULT_OUTPUT="json"
 
-TEMPLATE_DIR=${TEMPLATE_DIR:-/tmp/worker}
-
 ################################################################################
 ### Validate Required Arguments ################################
################################################################################ @@ -33,6 +31,7 @@ validate_env_set KUBERNETES_BUILD_DATE validate_env_set PULL_CNI_FROM_GITHUB validate_env_set PAUSE_CONTAINER_VERSION validate_env_set CACHE_CONTAINER_IMAGES +validate_env_set WORKING_DIR ################################################################################ ### Machine Architecture ####################################################### @@ -60,8 +59,8 @@ sudo yum install -y \ aws-cfn-bootstrap \ chrony \ conntrack \ - curl \ ec2-instance-connect \ + ethtool \ ipvsadm \ jq \ nfs-utils \ @@ -69,37 +68,37 @@ sudo yum install -y \ unzip \ wget \ yum-utils \ - yum-plugin-versionlock + yum-plugin-versionlock \ + mdadm \ + pigz + +# skip kernel version cleanup on al2023 +if ! cat /etc/*release | grep "al2023" > /dev/null 2>&1; then + # Remove any old kernel versions. `--count=1` here means "only leave 1 kernel version installed" + sudo package-cleanup --oldkernels --count=1 -y +fi -# Remove any old kernel versions. `--count=1` here means "only leave 1 kernel version installed" -sudo package-cleanup --oldkernels --count=1 -y +# packages that need special handling +if cat /etc/*release | grep "al2023" > /dev/null 2>&1; then + # exists in al2023 only (needed by kubelet) + sudo yum install -y iptables-legacy +else + # curl-minimal already exists in al2023 so install curl only on al2 + sudo yum install -y curl +fi sudo yum versionlock kernel-$(uname -r) # Remove the ec2-net-utils package, if it's installed. This package interferes with the route setup on the instance. if yum list installed | grep ec2-net-utils; then sudo yum remove ec2-net-utils -y -q; fi +sudo mkdir -p /etc/eks/ + ################################################################################ ### Time ####################################################################### ################################################################################ -# Make sure Amazon Time Sync Service starts on boot. -sudo chkconfig chronyd on - -# Make sure that chronyd syncs RTC clock to the kernel. -cat << EOF | sudo tee -a /etc/chrony.conf -# This directive enables kernel synchronisation (every 11 minutes) of the -# real-time clock. Note that it can’t be used along with the 'rtcfile' directive. -rtcsync -EOF - -# If current clocksource is xen, switch to tsc -if grep --quiet xen /sys/devices/system/clocksource/clocksource0/current_clocksource \ - && grep --quiet tsc /sys/devices/system/clocksource/clocksource0/available_clocksource; then - echo "tsc" | sudo tee /sys/devices/system/clocksource/clocksource0/current_clocksource -else - echo "tsc as a clock source is not applicable, skipping." 
-fi +sudo mv $WORKING_DIR/configure-clocksource.service /etc/eks/configure-clocksource.service ################################################################################ ### SSH ######################################################################## @@ -112,17 +111,20 @@ sudo systemctl restart sshd.service ################################################################################ ### iptables ################################################################### ################################################################################ -sudo mkdir -p /etc/eks -sudo mv $TEMPLATE_DIR/iptables-restore.service /etc/eks/iptables-restore.service + +sudo mv $WORKING_DIR/iptables-restore.service /etc/eks/iptables-restore.service ################################################################################ ### awscli ##################################################### ################################################################################ -if [[ "$BINARY_BUCKET_REGION" != "us-iso-east-1" && "$BINARY_BUCKET_REGION" != "us-isob-east-1" ]]; then +### isolated regions can't communicate to awscli.amazonaws.com so installing awscli through yum +ISOLATED_REGIONS="${ISOLATED_REGIONS:-us-iso-east-1 us-iso-west-1 us-isob-east-1}" +if ! [[ ${ISOLATED_REGIONS} =~ $BINARY_BUCKET_REGION ]]; then # https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html echo "Installing awscli v2 bundle" - AWSCLI_DIR=$(mktemp -d) + AWSCLI_DIR="${WORKING_DIR}/awscli-install" + mkdir "${AWSCLI_DIR}" curl \ --silent \ --show-error \ @@ -140,7 +142,7 @@ fi ### systemd #################################################################### ################################################################################ -sudo mv "${TEMPLATE_DIR}/runtime.slice" /etc/systemd/system/runtime.slice +sudo mv "${WORKING_DIR}/runtime.slice" /etc/systemd/system/runtime.slice ############################################################################### ### Containerd setup ########################################################## @@ -159,13 +161,13 @@ if [ -f "/etc/eks/containerd/containerd-config.toml" ]; then ## this means we are building a gpu ami and have already placed a containerd configuration file in /etc/eks echo "containerd config is already present" else - sudo mv $TEMPLATE_DIR/containerd-config.toml /etc/eks/containerd/containerd-config.toml + sudo mv $WORKING_DIR/containerd-config.toml /etc/eks/containerd/containerd-config.toml fi -sudo mv $TEMPLATE_DIR/kubelet-containerd.service /etc/eks/containerd/kubelet-containerd.service -sudo mv $TEMPLATE_DIR/sandbox-image.service /etc/eks/containerd/sandbox-image.service -sudo mv $TEMPLATE_DIR/pull-sandbox-image.sh /etc/eks/containerd/pull-sandbox-image.sh -sudo mv $TEMPLATE_DIR/pull-image.sh /etc/eks/containerd/pull-image.sh +sudo mv $WORKING_DIR/kubelet-containerd.service /etc/eks/containerd/kubelet-containerd.service +sudo mv $WORKING_DIR/sandbox-image.service /etc/eks/containerd/sandbox-image.service +sudo mv $WORKING_DIR/pull-sandbox-image.sh /etc/eks/containerd/pull-sandbox-image.sh +sudo mv $WORKING_DIR/pull-image.sh /etc/eks/containerd/pull-image.sh sudo chmod +x /etc/eks/containerd/pull-sandbox-image.sh sudo chmod +x /etc/eks/containerd/pull-image.sh @@ -212,7 +214,7 @@ if [[ "$INSTALL_DOCKER" == "true" ]]; then sudo sed -i '/OPTIONS/d' /etc/sysconfig/docker sudo mkdir -p /etc/docker - sudo mv $TEMPLATE_DIR/docker-daemon.json /etc/docker/daemon.json + sudo mv $WORKING_DIR/docker-daemon.json /etc/docker/daemon.json sudo chown 
root:root /etc/docker/daemon.json # Enable docker daemon to start on boot. @@ -225,8 +227,8 @@ fi # kubelet uses journald which has built-in rotation and capped size. # See man 5 journald.conf -sudo mv $TEMPLATE_DIR/logrotate-kube-proxy /etc/logrotate.d/kube-proxy -sudo mv $TEMPLATE_DIR/logrotate.conf /etc/logrotate.conf +sudo mv $WORKING_DIR/logrotate-kube-proxy /etc/logrotate.d/kube-proxy +sudo mv $WORKING_DIR/logrotate.conf /etc/logrotate.conf sudo chown root:root /etc/logrotate.d/kube-proxy sudo chown root:root /etc/logrotate.conf sudo mkdir -p /var/log/journal @@ -244,7 +246,7 @@ echo "Downloading binaries from: s3://$BINARY_BUCKET_NAME" S3_DOMAIN="amazonaws.com" if [ "$BINARY_BUCKET_REGION" = "cn-north-1" ] || [ "$BINARY_BUCKET_REGION" = "cn-northwest-1" ]; then S3_DOMAIN="amazonaws.com.cn" -elif [ "$BINARY_BUCKET_REGION" = "us-iso-east-1" ]; then +elif [ "$BINARY_BUCKET_REGION" = "us-iso-east-1" ] || [ "$BINARY_BUCKET_REGION" = "us-iso-west-1" ]; then S3_DOMAIN="c2s.ic.gov" elif [ "$BINARY_BUCKET_REGION" = "us-isob-east-1" ]; then S3_DOMAIN="sc2s.sgov.gov" @@ -309,19 +311,19 @@ sudo rm ./*.sha256 sudo mkdir -p /etc/kubernetes/kubelet sudo mkdir -p /etc/systemd/system/kubelet.service.d -sudo mv $TEMPLATE_DIR/kubelet-kubeconfig /var/lib/kubelet/kubeconfig +sudo mv $WORKING_DIR/kubelet-kubeconfig /var/lib/kubelet/kubeconfig sudo chown root:root /var/lib/kubelet/kubeconfig # Inject CSIServiceAccountToken feature gate to kubelet config if kubernetes version starts with 1.20. # This is only injected for 1.20 since CSIServiceAccountToken will be moved to beta starting 1.21. if [[ $KUBERNETES_VERSION == "1.20"* ]]; then - KUBELET_CONFIG_WITH_CSI_SERVICE_ACCOUNT_TOKEN_ENABLED=$(cat $TEMPLATE_DIR/kubelet-config.json | jq '.featureGates += {CSIServiceAccountToken: true}') - echo $KUBELET_CONFIG_WITH_CSI_SERVICE_ACCOUNT_TOKEN_ENABLED > $TEMPLATE_DIR/kubelet-config.json + KUBELET_CONFIG_WITH_CSI_SERVICE_ACCOUNT_TOKEN_ENABLED=$(cat $WORKING_DIR/kubelet-config.json | jq '.featureGates += {CSIServiceAccountToken: true}') + echo $KUBELET_CONFIG_WITH_CSI_SERVICE_ACCOUNT_TOKEN_ENABLED > $WORKING_DIR/kubelet-config.json fi -sudo mv $TEMPLATE_DIR/kubelet.service /etc/systemd/system/kubelet.service +sudo mv $WORKING_DIR/kubelet.service /etc/systemd/system/kubelet.service sudo chown root:root /etc/systemd/system/kubelet.service -sudo mv $TEMPLATE_DIR/kubelet-config.json /etc/kubernetes/kubelet/kubelet-config.json +sudo mv $WORKING_DIR/kubelet-config.json /etc/kubernetes/kubelet/kubelet-config.json sudo chown root:root /etc/kubernetes/kubelet/kubelet-config.json sudo systemctl daemon-reload @@ -333,38 +335,35 @@ sudo systemctl disable kubelet ################################################################################ sudo mkdir -p /etc/eks -sudo mv $TEMPLATE_DIR/get-ecr-uri.sh /etc/eks/get-ecr-uri.sh +sudo mv $WORKING_DIR/get-ecr-uri.sh /etc/eks/get-ecr-uri.sh sudo chmod +x /etc/eks/get-ecr-uri.sh -sudo mv $TEMPLATE_DIR/eni-max-pods.txt /etc/eks/eni-max-pods.txt -sudo mv $TEMPLATE_DIR/bootstrap.sh /etc/eks/bootstrap.sh +sudo mv $WORKING_DIR/eni-max-pods.txt /etc/eks/eni-max-pods.txt +sudo mv $WORKING_DIR/bootstrap.sh /etc/eks/bootstrap.sh sudo chmod +x /etc/eks/bootstrap.sh -sudo mv $TEMPLATE_DIR/max-pods-calculator.sh /etc/eks/max-pods-calculator.sh +sudo mv $WORKING_DIR/max-pods-calculator.sh /etc/eks/max-pods-calculator.sh sudo chmod +x /etc/eks/max-pods-calculator.sh ################################################################################ ### ECR CREDENTIAL PROVIDER 
####################################################
################################################################################
-if vercmp "$KUBERNETES_VERSION" gteq "1.22.0"; then
-  ECR_BINARY="ecr-credential-provider"
-  if [[ -n "$AWS_ACCESS_KEY_ID" ]]; then
-    echo "AWS cli present - using it to copy ecr-credential-provider binaries from s3."
-    aws s3 cp --region $BINARY_BUCKET_REGION $S3_PATH/$ECR_BINARY .
-  else
-    echo "AWS cli missing - using wget to fetch ecr-credential-provider binaries from s3. Note: This won't work for private bucket."
-    sudo wget "$S3_URL_BASE/$ECR_BINARY"
-  fi
-  sudo chmod +x $ECR_BINARY
-  sudo mkdir -p /etc/eks/ecr-credential-provider
-  sudo mv $ECR_BINARY /etc/eks/ecr-credential-provider
-
-  # copying credential provider config file to eks folder
-  sudo mv $TEMPLATE_DIR/ecr-credential-provider-config /etc/eks/ecr-credential-provider/ecr-credential-provider-config
+ECR_CREDENTIAL_PROVIDER_BINARY="ecr-credential-provider"
+if [[ -n "$AWS_ACCESS_KEY_ID" ]]; then
+  echo "AWS cli present - using it to copy ${ECR_CREDENTIAL_PROVIDER_BINARY} from s3."
+  aws s3 cp --region $BINARY_BUCKET_REGION $S3_PATH/$ECR_CREDENTIAL_PROVIDER_BINARY .
+else
+  echo "AWS cli missing - using wget to fetch ${ECR_CREDENTIAL_PROVIDER_BINARY} from s3. Note: This won't work for a private bucket."
+  sudo wget "$S3_URL_BASE/$ECR_CREDENTIAL_PROVIDER_BINARY"
 fi
+sudo chmod +x $ECR_CREDENTIAL_PROVIDER_BINARY
+sudo mkdir -p /etc/eks/image-credential-provider
+sudo mv $ECR_CREDENTIAL_PROVIDER_BINARY /etc/eks/image-credential-provider/
+sudo mv $WORKING_DIR/ecr-credential-provider-config.json /etc/eks/image-credential-provider/config.json
 
 ################################################################################
 ### Cache Images ###############################################################
 ################################################################################
-if [[ "$CACHE_CONTAINER_IMAGES" == "true" && "$BINARY_BUCKET_REGION" != "us-iso-east-1" && "$BINARY_BUCKET_REGION" != "us-isob-east-1" ]]; then
+
+if [[ "$CACHE_CONTAINER_IMAGES" == "true" ]] && ! [[ ${ISOLATED_REGIONS} =~ $BINARY_BUCKET_REGION ]]; then
   AWS_DOMAIN=$(imds 'latest/meta-data/services/domain')
   ECR_URI=$(/etc/eks/get-ecr-uri.sh "${BINARY_BUCKET_REGION}" "${AWS_DOMAIN}")
@@ -481,13 +480,13 @@ sudo yum install -y amazon-ssm-agent
 ################################################################################
 
 BASE_AMI_ID=$(imds /latest/meta-data/ami-id)
-cat << EOF > /tmp/release
+cat << EOF > "${WORKING_DIR}/release"
 BASE_AMI_ID="$BASE_AMI_ID"
 BUILD_TIME="$(date)"
 BUILD_KERNEL="$(uname -r)"
 ARCH="$(uname -m)"
 EOF
-sudo mv /tmp/release /etc/eks/release
+sudo mv "${WORKING_DIR}/release" /etc/eks/release
 sudo chown -R root:root /etc/eks
 
 ################################################################################
@@ -512,7 +511,7 @@ echo vm.max_map_count=524288 | sudo tee -a /etc/sysctl.conf
 ### adding log-collector-script ################################################
 ################################################################################
 sudo mkdir -p /etc/eks/log-collector-script/
-sudo cp $TEMPLATE_DIR/log-collector-script/eks-log-collector.sh /etc/eks/log-collector-script/
+sudo cp $WORKING_DIR/log-collector-script/eks-log-collector.sh /etc/eks/log-collector-script/
 
 ################################################################################
 ### Remove Yum Update from cloud-init config ###################################
diff --git a/scripts/upgrade_kernel.sh b/scripts/upgrade_kernel.sh
index 67e509caa..52d696056 100755
--- a/scripts/upgrade_kernel.sh
+++ b/scripts/upgrade_kernel.sh
@@ -24,4 +24,10 @@ sudo grubby \
   --update-kernel=ALL \
   --args="psi=1"
 
+# use the tsc clocksource by default
+# https://repost.aws/knowledge-center/manage-ec2-linux-clock-source
+sudo grubby \
+  --update-kernel=ALL \
+  --args="clocksource=tsc tsc=reliable"
+
 sudo reboot
diff --git a/scripts/validate.sh b/scripts/validate.sh
index ae329005e..da6a31627 100644
--- a/scripts/validate.sh
+++ b/scripts/validate.sh
@@ -45,8 +45,6 @@ else
   exit 1
 fi
 
-echo "Verifying that the package versionlocks are correct..."
-
 function versionlock-entries() {
   # the format of this output is EPOCH:NAME-VERSION-RELEASE.ARCH
   # more info in yum-versionlock(1)
@@ -58,18 +56,38 @@ function versionlock-packages() {
   versionlock-entries | xargs -I '{}' rpm --query '{}' --queryformat '%{NAME}\n'
 }
 
-for ENTRY in $(versionlock-entries); do
-  if ! rpm --query "$ENTRY" &> /dev/null; then
-    echo "There is no package matching the versionlock entry: '$ENTRY'"
-    exit 1
+function verify-versionlocks() {
+  for ENTRY in $(versionlock-entries); do
+    if ! rpm --query "$ENTRY" &> /dev/null; then
+      echo "There is no package matching the versionlock entry: '$ENTRY'"
+      exit 1
+    fi
+  done
+
+  LOCKED_PACKAGES=$(versionlock-packages | wc -l)
+  UNIQUE_LOCKED_PACKAGES=$(versionlock-packages | sort -u | wc -l)
+  if [ $LOCKED_PACKAGES -ne $UNIQUE_LOCKED_PACKAGES ]; then
+    echo "Package(s) have multiple version locks!"
+    versionlock-entries
+    exit 1
   fi
-done
-LOCKED_PACKAGES=$(versionlock-packages | wc -l)
-UNIQUE_LOCKED_PACKAGES=$(versionlock-packages | sort -u | wc -l)
-if [ $LOCKED_PACKAGES -ne $UNIQUE_LOCKED_PACKAGES ]; then
-  echo "Package(s) have multiple version locks!"
-  versionlock-entries
+
+  echo "Package versionlocks are correct!"
+}
+
+# run verify-versionlocks on al2 only, as it is not needed on al2023
+if ! cat /etc/*release | grep "al2023" > /dev/null 2>&1; then
+  echo "Verifying that the package versionlocks are correct..."
+  verify-versionlocks
 fi
-echo "Package versionlocks are correct!"
+REQUIRED_COMMANDS=(unpigz) + +for ENTRY in "${REQUIRED_COMMANDS[@]}"; do + if ! command -v "$ENTRY" > /dev/null; then + echo "Required command does not exist: '$ENTRY'" + exit 1 + fi +done + +echo "Required commands were found: ${REQUIRED_COMMANDS[*]}" diff --git a/test/Dockerfile b/test/Dockerfile index bab93ee84..d00837c3e 100644 --- a/test/Dockerfile +++ b/test/Dockerfile @@ -7,12 +7,13 @@ RUN amazon-linux-extras enable docker && \ ENV IMDS_ENDPOINT=127.0.0.1:1338 COPY --from=aemm /ec2-metadata-mock /sbin/ec2-metadata-mock +RUN mkdir -p /etc/systemd/system RUN mkdir -p /etc/eks/containerd COPY files/ /etc/eks/ COPY files/containerd-config.toml files/kubelet-containerd.service files/pull-sandbox-image.sh files/sandbox-image.service /etc/eks/containerd/ COPY files/kubelet-config.json /etc/kubernetes/kubelet/kubelet-config.json COPY files/kubelet-kubeconfig /var/lib/kubelet/kubeconfig -COPY files/ecr-credential-provider-config /etc/eks/ecr-credential-provider/ecr-credential-provider-config +COPY files/ecr-credential-provider-config.json /etc/eks/image-credential-provider/config.json COPY test/entrypoint.sh /entrypoint.sh COPY files/bin/* /usr/bin/ COPY test/mocks/ /sbin/ diff --git a/test/cases/ecr-credential-provider-config.sh b/test/cases/ecr-credential-provider-config.sh index 5d4856ed1..4eb74a761 100755 --- a/test/cases/ecr-credential-provider-config.sh +++ b/test/cases/ecr-credential-provider-config.sh @@ -4,7 +4,7 @@ set -euo pipefail exit_code=0 TEMP_DIR=$(mktemp -d) -export CRED_PROVIDER_FILE="/etc/eks/ecr-credential-provider/ecr-credential-provider-config" +export CRED_PROVIDER_FILE="/etc/eks/image-credential-provider/config.json" export CRED_PROVIDER_RESET_FILE="./cred-provider-config" # Store the original version of the config @@ -15,7 +15,7 @@ function reset_scenario { cp $CRED_PROVIDER_RESET_FILE $CRED_PROVIDER_FILE } -echo "--> Should default to credentialprovider.kubelet.k8s.io/v1alpha1 and kubelet.config.k8s.io/v1alpha1 when below k8s version 1.24" +echo "--> Should default to credentialprovider.kubelet.k8s.io/v1alpha1 and kubelet.config.k8s.io/v1alpha1 when below k8s version 1.27" reset_scenario # This variable is used to override the default value in the kubelet mock @@ -31,23 +31,24 @@ if [[ ${exit_code} -ne 0 ]]; then fi expected_cred_provider_api="credentialprovider.kubelet.k8s.io/v1alpha1" -actual=$(yq e '.providers[0].apiVersion' $CRED_PROVIDER_FILE) +actual=$(jq -r '.providers[0].apiVersion' $CRED_PROVIDER_FILE) if [[ "$expected_cred_provider_api" != "$actual" ]]; then echo "❌ Test Failed: expected 1.22 credential provider file to contain $expected_cred_provider_api" exit 1 fi expected_kubelet_config_api="kubelet.config.k8s.io/v1alpha1" -actual=$(yq e '.apiVersion' $CRED_PROVIDER_FILE) +actual=$(jq -r '.apiVersion' $CRED_PROVIDER_FILE) if [[ "$expected_kubelet_config_api" != "$actual" ]]; then echo "❌ Test Failed: expected 1.22 credential provider file to contain $expected_kubelet_config_api" exit 1 fi -echo "--> Should default to credentialprovider.kubelet.k8s.io/v1beta1 and kubelet.config.k8s.io/v1beta1 when at or above k8s version 1.24" +echo "--> Should default to credentialprovider.kubelet.k8s.io/v1alpha1 and kubelet.config.k8s.io/v1alpha1 when below k8s version 1.27" reset_scenario -export KUBELET_VERSION=v1.24.15-eks-ba74326 +# This variable is used to override the default value in the kubelet mock +export KUBELET_VERSION=v1.26.0-eks-ba74326 /etc/eks/bootstrap.sh \ --b64-cluster-ca dGVzdA== \ --apiserver-endpoint http://my-api-endpoint \ @@ -58,18 +59,44 @@ 
if [[ ${exit_code} -ne 0 ]]; then exit 1 fi -expected_cred_provider_api="credentialprovider.kubelet.k8s.io/v1beta1" -actual=$(yq e '.providers[0].apiVersion' $CRED_PROVIDER_FILE) +expected_cred_provider_api="credentialprovider.kubelet.k8s.io/v1alpha1" +actual=$(jq -r '.providers[0].apiVersion' $CRED_PROVIDER_FILE) if [[ "$expected_cred_provider_api" != "$actual" ]]; then - echo "❌ Test Failed: expected 1.24 credential provider file to contain $expected_cred_provider_api" + echo "❌ Test Failed: expected 1.26 credential provider file to contain $expected_cred_provider_api" exit 1 fi -expected_kubelet_config_api="kubelet.config.k8s.io/v1beta1" -actual=$(yq e '.apiVersion' $CRED_PROVIDER_FILE) +expected_kubelet_config_api="kubelet.config.k8s.io/v1alpha1" +actual=$(jq -r '.apiVersion' $CRED_PROVIDER_FILE) if [[ "$expected_kubelet_config_api" != "$actual" ]]; then - echo "❌ Test Failed: expected 1.24 credential provider file to contain $expected_kubelet_config_api" + echo "❌ Test Failed: expected 1.26 credential provider file to contain $expected_kubelet_config_api" exit 1 fi -exit_code=0 +echo "--> Should default to credentialprovider.kubelet.k8s.io/v1 and kubelet.config.k8s.io/v1 when at or above k8s version 1.27" +reset_scenario + +export KUBELET_VERSION=v1.27.1-eks-ba74326 +/etc/eks/bootstrap.sh \ + --b64-cluster-ca dGVzdA== \ + --apiserver-endpoint http://my-api-endpoint \ + test || exit_code=$? + +if [[ ${exit_code} -ne 0 ]]; then + echo "❌ Test Failed: expected a zero exit code but got '${exit_code}'" + exit 1 +fi + +expected_cred_provider_api="credentialprovider.kubelet.k8s.io/v1" +actual=$(jq -r '.providers[0].apiVersion' $CRED_PROVIDER_FILE) +if [[ "$expected_cred_provider_api" != "$actual" ]]; then + echo "❌ Test Failed: expected 1.27 credential provider file to contain $expected_cred_provider_api" + exit 1 +fi + +expected_kubelet_config_api="kubelet.config.k8s.io/v1" +actual=$(jq -r '.apiVersion' $CRED_PROVIDER_FILE) +if [[ "$expected_kubelet_config_api" != "$actual" ]]; then + echo "❌ Test Failed: expected 1.27 credential provider file to contain $expected_kubelet_config_api" + exit 1 +fi diff --git a/test/cases/mount-bpf-fs.sh b/test/cases/mount-bpf-fs.sh index e8ef5da99..c5281d4e2 100755 --- a/test/cases/mount-bpf-fs.sh +++ b/test/cases/mount-bpf-fs.sh @@ -61,19 +61,19 @@ EXIT_CODE=0 /etc/eks/bootstrap.sh \ --b64-cluster-ca dGVzdA== \ --apiserver-endpoint http://my-api-endpoint \ - test || exit_code=$? + test || EXIT_CODE=$? if [[ ${EXIT_CODE} -ne 0 ]]; then echo "❌ Test Failed: expected a zero exit code but got '${EXIT_CODE}'" exit 1 fi -if [ "$(cat $MOUNT_BPF_FS_MOCK)" = "called" ]; then +if [ ! "$(cat $MOUNT_BPF_FS_MOCK)" = "called" ]; then echo "❌ Test Failed: expected mount-bpf-fs to be called once but it was not!" exit 1 fi export -nf mount-bpf-fs -echo "--> Should default to false on 1.26-" -export KUBELET_VERSION=v1.26.0-eks-ba74326 +echo "--> Should default to false on 1.24-" +export KUBELET_VERSION=v1.24.0-eks-ba74326 MOUNT_BPF_FS_MOCK=$(mktemp) function mount-bpf-fs() { echo "called" >> $MOUNT_BPF_FS_MOCK @@ -84,7 +84,7 @@ EXIT_CODE=0 /etc/eks/bootstrap.sh \ --b64-cluster-ca dGVzdA== \ --apiserver-endpoint http://my-api-endpoint \ - test || exit_code=$? + test || EXIT_CODE=$? 
if [[ ${EXIT_CODE} -ne 0 ]]; then echo "❌ Test Failed: expected a zero exit code but got '${EXIT_CODE}'" exit 1 diff --git a/test/test-harness.sh b/test/test-harness.sh index b0cc2180f..c253f562e 100755 --- a/test/test-harness.sh +++ b/test/test-harness.sh @@ -37,12 +37,14 @@ done docker build -t eks-optimized-ami -f "${SCRIPTPATH}/Dockerfile" "${SCRIPTPATH}/../" overall_status=0 +test_run_log_file=$(mktemp) + function run() { docker run -v "$(realpath $1):/test.sh" \ --attach STDOUT \ --attach STDERR \ --rm \ - eks-optimized-ami + eks-optimized-ami > $test_run_log_file 2>&1 } if [[ ! -z ${TEST_CASE_SCRIPT} ]]; then @@ -59,6 +61,7 @@ for case in "${test_cases[@]}"; do if [[ ${status} -eq 0 ]]; then echo "✅ ✅ $(basename ${case}) Tests Passed! ✅ ✅" else + cat $test_run_log_file echo "❌ ❌ $(basename ${case}) Tests Failed! ❌ ❌" overall_status=1 fi
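To exercise these harness changes locally, Docker must be available; whether `TEST_CASE_SCRIPT` is taken from the environment or set by a flag depends on the option parsing earlier in test-harness.sh, so treat the second invocation as a sketch:

```bash
# Run every script under test/cases/ inside the eks-optimized-ami container:
./test/test-harness.sh

# Run a single case; with the log capture added above, the container output
# is printed only when the case fails:
TEST_CASE_SCRIPT=test/cases/mount-bpf-fs.sh ./test/test-harness.sh
```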