From 6ce39945cd3d6cea7a3775055c22be94877052e0 Mon Sep 17 00:00:00 2001 From: Luke Hoban Date: Wed, 3 Jul 2019 10:49:02 -0700 Subject: [PATCH 1/4] Wait for cluster endpoint Try out sleeping for 10s to see if that addresses availability issues. If so, we likely want to make a more correct fix to poll the API endpoint instead. --- nodejs/eks/cluster.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nodejs/eks/cluster.ts b/nodejs/eks/cluster.ts index a5a7ec83a..03b900765 100644 --- a/nodejs/eks/cluster.ts +++ b/nodejs/eks/cluster.ts @@ -163,7 +163,7 @@ export function createCore(name: string, args: ClusterOptions, parent: pulumi.Co // Compute the required kubeconfig. Note that we do not export this value: we want the exported config to // depend on the autoscaling group we'll create later so that nothing attempts to use the EKS cluster before // its worker nodes have come up. - const kubeconfig = pulumi.all([eksCluster.name, eksCluster.endpoint, eksCluster.certificateAuthority]) + const kubeconfigInitial = pulumi.all([eksCluster.name, eksCluster.endpoint, eksCluster.certificateAuthority]) .apply(([clusterName, clusterEndpoint, clusterCertificateAuthority]) => { return { apiVersion: "v1", @@ -196,6 +196,14 @@ export function createCore(name: string, args: ClusterOptions, parent: pulumi.Co }; }); + const kubeconfig = kubeconfigInitial.apply(async (kcfg) => { + if (!pulumi.runtime.isDryRun()) { + pulumi.log.info("Waiting for cluster endpoint...", eksCluster, undefined, true); + await new Promise((res) => setTimeout(res, 10000)); + } + return kcfg; + }); + const provider = new k8s.Provider(`${name}-eks-k8s`, { kubeconfig: kubeconfig.apply(JSON.stringify), }, { parent: parent }); From 04a6070a457b80f4dfd418f4b0692c7454525d6f Mon Sep 17 00:00:00 2001 From: Luke Hoban Date: Wed, 3 Jul 2019 15:20:23 -0700 Subject: [PATCH 2/4] Wait a lot longer... 
--- nodejs/eks/cluster.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodejs/eks/cluster.ts b/nodejs/eks/cluster.ts index 03b900765..abafd7d38 100644 --- a/nodejs/eks/cluster.ts +++ b/nodejs/eks/cluster.ts @@ -199,7 +199,7 @@ export function createCore(name: string, args: ClusterOptions, parent: pulumi.Co const kubeconfig = kubeconfigInitial.apply(async (kcfg) => { if (!pulumi.runtime.isDryRun()) { pulumi.log.info("Waiting for cluster endpoint...", eksCluster, undefined, true); - await new Promise((res) => setTimeout(res, 10000)); + await new Promise(resolve => setTimeout(resolve, 5*60*1000)); } return kcfg; }); From e1f772eff88efc712e63fb28fdbd5f4f47472af5 Mon Sep 17 00:00:00 2001 From: Luke Hoban Date: Wed, 3 Jul 2019 16:46:25 -0700 Subject: [PATCH 3/4] Test cluster endpoint health Instead of always waiting 5 minutes, try to hit the cluster endpoint every 5 seconds, continuing on as soon as we can. If we still can't reach it after 5 minutes, we allow the operation to continue (presumably to fail on some other operation). 
--- nodejs/eks/cluster.ts | 48 ++++++++++++++++++++++++++++------------- nodejs/eks/package.json | 2 ++ 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/nodejs/eks/cluster.ts b/nodejs/eks/cluster.ts index abafd7d38..4b061f60c 100644 --- a/nodejs/eks/cluster.ts +++ b/nodejs/eks/cluster.ts @@ -15,8 +15,10 @@ import * as aws from "@pulumi/aws"; import * as k8s from "@pulumi/kubernetes"; import * as pulumi from "@pulumi/pulumi"; +import * as https from "https"; import * as jsyaml from "js-yaml"; -import which = require("which"); +import fetch from "node-fetch"; +import * as which from "which"; import { VpcCni, VpcCniOptions } from "./cni"; import { createDashboard } from "./dashboard"; @@ -142,7 +144,7 @@ export function createCore(name: string, args: ClusterOptions, parent: pulumi.Co fromPort: 0, toPort: 0, protocol: "-1", // all - cidrBlocks: [ "0.0.0.0/0" ], + cidrBlocks: ["0.0.0.0/0"], securityGroupId: eksClusterSecurityGroup.id, }, { parent: parent }); } @@ -160,10 +162,34 @@ export function createCore(name: string, args: ClusterOptions, parent: pulumi.Co enabledClusterLogTypes: args.enabledClusterLogTypes, }, { parent: parent }); + // Instead of using the kubeconfig directly, we also add a wait of up to 5 minutes or until we + // can reach the API server to the Output that provides access to the kubeconfig string so that + // there is time for the target cluster API server to become completely available. Ideally we + // would rely on the EKS API only returning once this was available, but we have seen frequent + // cases where it is not yet available immediately after provisioning - possibly due to DNS + // propagation delay or other non-deterministic factors. + const endpoint = eksCluster.endpoint.apply(async (clusterEndpoint) => { + if (!pulumi.runtime.isDryRun()) { + // For up to 300 seconds, try to contact the API cluster endpoint and verify that it is reachable. 
+ const agent = new https.Agent({ rejectUnauthorized: false }); + for (let i = 0; i < 60; i++) { + try { + const resp = await fetch(clusterEndpoint, { agent }); + await resp.json(); + break; + } catch (e) { + pulumi.log.info(`Waiting for cluster endpoint (${i + 1})`, eksCluster, undefined, true); + } + await new Promise(resolve => setTimeout(resolve, 5 * 1000)); + } + } + return clusterEndpoint; + }); + // Compute the required kubeconfig. Note that we do not export this value: we want the exported config to // depend on the autoscaling group we'll create later so that nothing attempts to use the EKS cluster before // its worker nodes have come up. - const kubeconfigInitial = pulumi.all([eksCluster.name, eksCluster.endpoint, eksCluster.certificateAuthority]) + const kubeconfig = pulumi.all([eksCluster.name, endpoint, eksCluster.certificateAuthority]) .apply(([clusterName, clusterEndpoint, clusterCertificateAuthority]) => { return { apiVersion: "v1", @@ -196,14 +222,6 @@ export function createCore(name: string, args: ClusterOptions, parent: pulumi.Co }; }); - const kubeconfig = kubeconfigInitial.apply(async (kcfg) => { - if (!pulumi.runtime.isDryRun()) { - pulumi.log.info("Waiting for cluster endpoint...", eksCluster, undefined, true); - await new Promise(resolve => setTimeout(resolve, 5*60*1000)); - } - return kcfg; - }); - const provider = new k8s.Provider(`${name}-eks-k8s`, { kubeconfig: kubeconfig.apply(JSON.stringify), }, { parent: parent }); @@ -537,7 +555,7 @@ export interface ClusterOptions { * ClusterNodeGroupOptions describes the configuration options accepted by a cluster * to create its own node groups. It's a subset of NodeGroupOptions. 
*/ -export interface ClusterNodeGroupOptions extends NodeGroupBaseOptions {} +export interface ClusterNodeGroupOptions extends NodeGroupBaseOptions { } /** * Cluster is a component that wraps the AWS and Kubernetes resources necessary to run an EKS cluster, its worker @@ -689,8 +707,8 @@ export class Cluster extends pulumi.ComponentResource { nodeSecurityGroup: this.nodeSecurityGroup, clusterIngressRule: this.eksClusterIngressRule, }, { - parent: this, - providers: { kubernetes: this.provider }, - }); + parent: this, + providers: { kubernetes: this.provider }, + }); } } diff --git a/nodejs/eks/package.json b/nodejs/eks/package.json index e4af64207..f3b5c5367 100644 --- a/nodejs/eks/package.json +++ b/nodejs/eks/package.json @@ -15,11 +15,13 @@ "@pulumi/kubernetes": "^0.24.0", "@pulumi/pulumi": "^0.17.8", "netmask": "^1.0.6", + "axios": "^0.19.0", "which": "^1.3.1" }, "devDependencies": { "@types/netmask": "^1.0.30", "@types/node": "^8.0.26", + "@types/axios": "^0.14.0", "@types/which": "^1.3.1", "tslint": "^5.7.0", "typescript": "^2.6.2" From f4d235a306647a4c0590e002c8ec445f26416c79 Mon Sep 17 00:00:00 2001 From: metral <1112768+metral@users.noreply.github.com> Date: Wed, 3 Jul 2019 20:30:49 -0400 Subject: [PATCH 4/4] Apply suggestions from code review --- nodejs/eks/cluster.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nodejs/eks/cluster.ts b/nodejs/eks/cluster.ts index 4b061f60c..70b57b329 100644 --- a/nodejs/eks/cluster.ts +++ b/nodejs/eks/cluster.ts @@ -163,8 +163,8 @@ export function createCore(name: string, args: ClusterOptions, parent: pulumi.Co }, { parent: parent }); // Instead of using the kubeconfig directly, we also add a wait of up to 5 minutes or until we - // can reach the API server to the Output that provides access to the kubeconfig string so that - // there is time for the target cluster API server to become completely available. 
Ideally we + // can reach the API server for the Output that provides access to the kubeconfig string so that + // there is time for the cluster API server to become completely available. Ideally we // would rely on the EKS API only returning once this was available, but we have seen frequent // cases where it is not yet available immediately after provisioning - possibly due to DNS // propagation delay or other non-deterministic factors.