From f5abbf97db74614e73b1022dd44c274f8dc035a7 Mon Sep 17 00:00:00 2001 From: pingcap-github-bot Date: Fri, 13 Mar 2020 20:00:23 +0800 Subject: [PATCH] Add periodical e2e job for EKS (#1915) (#1926) --- ci/aws-clean-eks.sh | 105 ++++++++++++++++ ci/e2e_eks.groovy | 137 +++++++++++++++++++++ ci/e2e_gke.groovy | 6 +- ci/pingcap_tidb_operator_build_kind.groovy | 2 +- hack/e2e.sh | 15 +-- hack/lib.sh | 4 +- hack/run-e2e.sh | 50 ++++---- hack/run-in-container.sh | 2 +- tests/e2e/e2e.go | 4 +- 9 files changed, 287 insertions(+), 38 deletions(-) create mode 100755 ci/aws-clean-eks.sh create mode 100644 ci/e2e_eks.groovy diff --git a/ci/aws-clean-eks.sh b/ci/aws-clean-eks.sh new file mode 100755 index 0000000000..69f11ec570 --- /dev/null +++ b/ci/aws-clean-eks.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Copyright 2020 PingCAP, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# aws-k8s-tester cannot clean up all the resources it created if an error occurs. +# This script cleans up the resources created by aws-k8s-tester in our CI. +# +# DO NOT USE THIS SCRIPT FOR ANY OTHER PURPOSE! 
+# + +function get_stacks() { # prints the names of all stacks in CREATE_COMPLETE or DELETE_FAILED state + aws cloudformation list-stacks --stack-status-filter CREATE_COMPLETE DELETE_FAILED --query 'StackSummaries[*].StackName' --output text +} + +function fix_eks_mng_deletion_issues() { # $1: cluster name, $2: nodegroup name; deletes the security groups (and their leaked network interfaces) listed in the nodegroup's Ec2SecurityGroupDeletionFailure health issues + local cluster="$1" + local mng="$2" + while IFS=$'\n' read -r line; do + read -r code resourceIds <<< "$line" + if [ "$code" == "Ec2SecurityGroupDeletionFailure" ]; then + echo "info: clear security group '$resourceIds'" + for eni in $(aws ec2 describe-network-interfaces --filters "Name=group-id,Values=$resourceIds" --query 'NetworkInterfaces[*].NetworkInterfaceId' --output text); do + echo "info: clear leaked network interfaces '$eni'" + aws ec2 delete-network-interface --network-interface-id "$eni" + done + aws ec2 delete-security-group --group-id $resourceIds + fi + done <<< "$(aws eks describe-nodegroup --cluster-name "$cluster" --nodegroup-name "$mng" --query 'nodegroup.health.issues' --output json | jq -r '.[].resourceIds |= join(",") | .[] | "\(.code)\t\(.resourceIds)"')" +} + +function clean_eks() { # $1: cluster name; deletes the cluster's mng stack first, then its cluster, role and vpc stacks + local CLUSTER="$1" + echo "info: deleting mng stack" + local regex='^'$CLUSTER'-mng-[0-9]+$' + local mngStack= + for stackName in $(get_stacks); do + if [[ ! "$stackName" =~ $regex ]]; then + continue + fi + mngStack=$stackName + break + done + if [ -n "$mngStack" ]; then + echo "info: mng stack found '$mngStack', deleting it" + aws cloudformation delete-stack --stack-name $mngStack + aws cloudformation wait stack-delete-complete --stack-name $mngStack + if [ $? 
-ne 0 ]; then + echo "error: failed to delete mng stack '$mngStack', delete related resource first" + for mngName in $(aws eks list-nodegroups --cluster-name "$CLUSTER" --query 'nodegroups[*]' --output text); do + fix_eks_mng_deletion_issues "$CLUSTER" "$mngName" + done + aws cloudformation delete-stack --stack-name $mngStack + aws cloudformation wait stack-delete-complete --stack-name $mngStack + fi + else + echo "info: mng stack not found, skipped" + fi + + echo "info: deleting cluster/cluster-role/mng-role/vpc stacks" + local stacks=( + $CLUSTER-cluster + $CLUSTER-role-cluster + $CLUSTER-role-mng + $CLUSTER-vpc + ) + for stack in ${stacks[@]}; do + echo "info: deleting stack $stack" + aws cloudformation delete-stack --stack-name $stack + aws cloudformation wait stack-delete-complete --stack-name $stack + done +} + +# https://github.com/aws/aws-cli#other-configurable-variables +if [ -n "${AWS_REGION}" ]; then + export AWS_DEFAULT_REGION=${AWS_REGION:-} +fi + +aws sts get-caller-identity +if [ $? -ne 0 ]; then + echo "error: failed to get caller identity" + exit 1 +fi + +for CLUSTER in "$@"; do + echo "info: start to clean eks test cluster '$CLUSTER'" + clean_eks "$CLUSTER" + if [ $? -eq 0 ]; then + echo "info: succesfully cleaned the eks test cluster '$CLUSTER'" + else + echo "fatal: failed to clean the eks test cluster '$CLUSTER'" + exit 1 + fi +done diff --git a/ci/e2e_eks.groovy b/ci/e2e_eks.groovy new file mode 100644 index 0000000000..0c1a967ed5 --- /dev/null +++ b/ci/e2e_eks.groovy @@ -0,0 +1,137 @@ +// +// Jenkins pipeline for EKS e2e job. +// +// This script is written in declarative syntax. Refer to +// https://jenkins.io/doc/book/pipeline/syntax/ for more details. +// +// Note that parameters of the job are configured in this script. 
+// + +import groovy.transform.Field + +@Field +def podYAML = ''' +apiVersion: v1 +kind: Pod +spec: + containers: + - name: main + image: gcr.io/k8s-testimages/kubekins-e2e:v20200311-1e25827-master + command: + - runner.sh + - sleep + - 1d + # we need privileged mode in order to do docker in docker + securityContext: + privileged: true + env: + - name: DOCKER_IN_DOCKER_ENABLED + value: "true" + resources: + requests: + memory: "4000Mi" + cpu: 2000m + volumeMounts: + # dind expects /var/lib/docker to be volume + - name: docker-root + mountPath: /var/lib/docker + volumes: + - name: docker-root + emptyDir: {} +''' + +pipeline { + agent { + kubernetes { + yaml podYAML + defaultContainer "main" + customWorkspace "/home/jenkins/agent/workspace/go/src/github.com/pingcap/tidb-operator" + } + } + + options { + timeout(time: 3, unit: 'HOURS') // applies to the whole pipeline run + } + + parameters { + string(name: 'GIT_URL', defaultValue: 'git@github.com:pingcap/tidb-operator.git', description: 'git repo url') + string(name: 'GIT_REF', defaultValue: 'master', description: 'git ref spec to checkout, e.g. master, release-1.1') + string(name: 'PR_ID', defaultValue: '', description: 'pull request ID, this will override GIT_REF if set, e.g. 1889') + string(name: 'CLUSTER', defaultValue: 'jenkins-tidb-operator-e2e', description: 'the name of the cluster') + string(name: 'AWS_REGION', defaultValue: 'us-west-2', description: 'the AWS region') + string(name: 'GINKGO_NODES', defaultValue: '8', description: 'the number of ginkgo nodes') + } + + environment { + GIT_REF = '' // assigned in the Prepare stage + ARTIFACTS = "${env.WORKSPACE}/artifacts" + } + + stages { + stage("Prepare") { + steps { + // Declarative pipelines allow only a restricted subset of syntax + // inside stage blocks, so GIT_REF is computed in a script step, + // which bypasses that restriction. 
+ // https://jenkins.io/doc/book/pipeline/syntax/#script + script { + GIT_REF = params.GIT_REF + if (params.PR_ID != "") { + GIT_REF = "refs/remotes/origin/pr/${params.PR_ID}/head" + } + } + echo "env.NODE_NAME: ${env.NODE_NAME}" + echo "env.WORKSPACE: ${env.WORKSPACE}" + echo "GIT_REF: ${GIT_REF}" + echo "ARTIFACTS: ${ARTIFACTS}" + } + } + + stage("Checkout") { + steps { + checkout scm: [ + $class: 'GitSCM', + branches: [[name: GIT_REF]], + userRemoteConfigs: [[ + credentialsId: 'github-sre-bot-ssh', + refspec: '+refs/heads/*:refs/remotes/origin/* +refs/pull/*:refs/remotes/origin/pr/*', // pull request refs are fetched under origin/pr/*, which is what GIT_REF points at when PR_ID is set + url: "${params.GIT_URL}", + ]] + ] + } + } + + stage("Run") { + steps { + withCredentials([ + string(credentialsId: 'TIDB_OPERATOR_AWS_ACCESS_KEY_ID', variable: 'AWS_ACCESS_KEY_ID'), + string(credentialsId: 'TIDB_OPERATOR_AWS_SECRET_ACCESS_KEY', variable: 'AWS_SECRET_ACCESS_KEY'), + ]) { + sh """ + #!/bin/bash + export PROVIDER=eks + export CLUSTER=${params.CLUSTER} + export AWS_REGION=${params.AWS_REGION} + export GINKGO_NODES=${params.GINKGO_NODES} + export REPORT_DIR=${ARTIFACTS} + echo "info: try to clean the cluster created previously" + ./ci/aws-clean-eks.sh \$CLUSTER + echo "info: begin to run e2e" + ./hack/e2e.sh -- --ginkgo.skip='\\[Serial\\]' --ginkgo.focus='\\[tidb-operator\\]' + """ + } + } + } + } + + post { + always { + dir(ARTIFACTS) { // the directory the Run stage exports as REPORT_DIR + archiveArtifacts artifacts: "**", allowEmptyArchive: true + junit testResults: "*.xml", allowEmptyResults: true + } + } + } +} + +// vim: et sw=4 ts=4 diff --git a/ci/e2e_gke.groovy b/ci/e2e_gke.groovy index 12fb0d429c..5575caae97 100644 --- a/ci/e2e_gke.groovy +++ b/ci/e2e_gke.groovy @@ -16,7 +16,7 @@ kind: Pod spec: containers: - name: main - image: gcr.io/k8s-testimages/kubekins-e2e:v20191108-9467d02-master + image: gcr.io/k8s-testimages/kubekins-e2e:v20200311-1e25827-master command: 
- runner.sh - sleep @@ -49,6 +49,10 @@ pipeline { } } + options { + timeout(time: 3, unit: 'HOURS') + } + parameters { string(name: 'GIT_URL', 
defaultValue: 'git@github.com:pingcap/tidb-operator.git', description: 'git repo url') string(name: 'GIT_REF', defaultValue: 'master', description: 'git ref spec to checkout, e.g. master, release-1.1') diff --git a/ci/pingcap_tidb_operator_build_kind.groovy b/ci/pingcap_tidb_operator_build_kind.groovy index 2edb302624..aaa97ed6ff 100644 --- a/ci/pingcap_tidb_operator_build_kind.groovy +++ b/ci/pingcap_tidb_operator_build_kind.groovy @@ -14,7 +14,7 @@ metadata: spec: containers: - name: main - image: gcr.io/k8s-testimages/kubekins-e2e:v20191108-9467d02-master + image: gcr.io/k8s-testimages/kubekins-e2e:v20200311-1e25827-master command: - runner.sh # Clean containers on TERM signal in root process to avoid cgroup leaking. diff --git a/hack/e2e.sh b/hack/e2e.sh index 0470bc22da..881102dac8 100755 --- a/hack/e2e.sh +++ b/hack/e2e.sh @@ -215,8 +215,9 @@ echo "GCP_PROJECT: $GCP_PROJECT" echo "GCP_CREDENTIALS: $GCP_CREDENTIALS" echo "GCP_REGION: $GCP_REGION" echo "GCP_ZONE: $GCP_ZONE" -echo "AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID" -echo "AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY" +# Do not echo the AWS credential environment variables; they are secrets. 
+# echo "AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID" +# echo "AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY" echo "AWS_REGION: $AWS_REGION" echo "KUBE_VERSION: $KUBE_VERSION" echo "KUBE_WORKERS: $KUBE_WORKERS" @@ -473,16 +474,12 @@ elif [ "$PROVIDER" == "eks" ]; then mngName=$CLUSTER-mng-$RANDOM export AWS_K8S_TESTER_EKS_NAME=$CLUSTER export AWS_K8S_TESTER_EKS_CONFIG_PATH=/tmp/kubetest2.eks.$CLUSTER - export AWS_K8S_TESTER_EKS_ADD_ON_NLB_HELLO_WORLD_ENABLE="false" + export AWS_K8S_TESTER_EKS_PARAMETERS_VERSION="1.15" + export AWS_K8S_TESTER_EKS_PARAMETERS_ENCRYPTION_CMK_CREATE="false" + export AWS_K8S_TESTER_EKS_ADD_ON_MANAGED_NODE_GROUPS_ENABLE="true" export AWS_K8S_TESTER_EKS_ADD_ON_MANAGED_NODE_GROUPS_MNGS=$(printf '{"%s":{"name":"%s","ami-type":"AL2_x86_64","asg-min-size":%d,"asg-max-size":%d,"asg-desired-capacity":%d,"instance-types":["c5.xlarge"],"volume-size":40}}' "$mngName" "$mngName" "$KUBE_WORKERS" "$KUBE_WORKERS" "$KUBE_WORKERS") # override KUBECONFIG KUBECONFIG=$AWS_K8S_TESTER_EKS_CONFIG_PATH.kubeconfig.yaml - if [ -z "$SKIP_UP" ]; then - # clear previous created private key to work around permission issue on this file - if test -f $HOME/.ssh/kube_aws_rsa; then - rm -f $HOME/.ssh/kube_aws_rsa - fi - fi else echo "error: unsupported provider '$PROVIDER'" exit 1 diff --git a/hack/lib.sh b/hack/lib.sh index 54f02e3131..4494a33861 100644 --- a/hack/lib.sh +++ b/hack/lib.sh @@ -36,7 +36,7 @@ KIND_VERSION=${KIND_VERSION:-0.7.0} KIND_BIN=$OUTPUT_BIN/kind KUBETEST2_VERSION=v0.0.8 KUBETSTS2_BIN=$OUTPUT_BIN/kubetest2 -AWS_K8S_TESTER_VERSION=v0.6.2 +AWS_K8S_TESTER_VERSION=v0.7.4 AWS_K8S_TESTER_BIN=$OUTPUT_BIN/aws-k8s-tester test -d "$OUTPUT_BIN" || mkdir -p "$OUTPUT_BIN" @@ -184,7 +184,7 @@ function hack::ensure_kubetest2() { function hack::verify_aws_k8s_tester() { if test -x $AWS_K8S_TESTER_BIN; then - [[ "$($AWS_K8S_TESTER_BIN version | awk '/ReleaseVersion/ {print $2}')" == "$AWS_K8S_TESTER_VERSION" ]] + [[ "$($AWS_K8S_TESTER_BIN version | jq '."release-version"' 
-r)" == "$AWS_K8S_TESTER_VERSION" ]] return fi return 1 diff --git a/hack/run-e2e.sh b/hack/run-e2e.sh index 5c180d0210..bc78ccf3ad 100755 --- a/hack/run-e2e.sh +++ b/hack/run-e2e.sh @@ -180,7 +180,7 @@ function e2e::__eks_instances() { } function e2e::__ecr_url() { - local account_id=$(aws sts get-caller-identity | awk '/Account/ { gsub("\x27", "", $2); print $2}') + local account_id=$(aws sts get-caller-identity --output text | awk '{print $1}') local region=$(aws configure get region) echo "${account_id}.dkr.ecr.${region}.amazonaws.com" } @@ -271,16 +271,17 @@ if [ "$PROVIDER" == "gke" ]; then fi gcloud container clusters get-credentials "$CLUSTER" elif [ "$PROVIDER" == "eks" ]; then - : + aws eks update-kubeconfig --name "$CLUSTER" fi if [ -z "$KUBECONTEXT" ]; then - echo "info: KUBECONTEXT is not set, current context $KUBECONTEXT is used" - KUBECONTEXT=$(kubectl config current-context 2>/dev/null) || true + echo "info: KUBECONTEXT is not set, current context is used" + KUBECONTEXT=$($KUBECTL_BIN config current-context 2>/dev/null) || true if [ -z "$KUBECONTEXT" ]; then - echo "error: KUBECONTEXT cannot be detected" + echo "error: current context cannot be detected" exit 1 fi + echo "info: current kubeconfig context is '$KUBECONTEXT'" fi e2e::image_load @@ -310,7 +311,6 @@ e2e_args=( ${ginkgo_args[@]:-} /usr/local/bin/e2e.test -- - --provider=${PROVIDER} --clean-start=true --delete-namespace-on-failure=false --repo-root=$ROOT @@ -324,13 +324,6 @@ e2e_args=( -v=4 ) -if [ -n "$REPORT_DIR" ]; then - e2e_args+=( - --report-dir="${REPORT_DIR}" - --report-prefix="${REPORT_PREFIX}" - ) -fi - e2e_args+=(${@:-}) docker_args=( @@ -347,23 +340,36 @@ docker_args=( ) if [ "$PROVIDER" == "eks" ]; then + e2e_args+=( + --provider=aws + --gce-zone ${AWS_REGION} + ) # aws credential is required to get token for EKS docker_args+=( -v $HOME/.aws:/root/.aws ) elif [ "$PROVIDER" == "gke" ]; then - e2e_args+=( - --gce-project ${GCP_PROJECT} - --gce-region ${GCP_REGION} - --gce-zone 
${GCP_ZONE} - ) - docker_args+=( - -v ${GCP_CREDENTIALS}:${GCP_CREDENTIALS} - --env GOOGLE_APPLICATION_CREDENTIALS=${GCP_CREDENTIALS} - ) + e2e_args+=( + --provider=${PROVIDER} + --gce-project ${GCP_PROJECT} + --gce-region ${GCP_REGION} + --gce-zone ${GCP_ZONE} + ) + docker_args+=( + -v ${GCP_CREDENTIALS}:${GCP_CREDENTIALS} + --env GOOGLE_APPLICATION_CREDENTIALS=${GCP_CREDENTIALS} + ) +else + e2e_args+=( + --provider=${PROVIDER} + ) fi if [ -n "$REPORT_DIR" ]; then + e2e_args+=( + --report-dir="${REPORT_DIR}" + --report-prefix="${REPORT_PREFIX}" + ) docker_args+=( -v $REPORT_DIR:$REPORT_DIR ) diff --git a/hack/run-in-container.sh b/hack/run-in-container.sh index d99204bbb9..f821724200 100755 --- a/hack/run-in-container.sh +++ b/hack/run-in-container.sh @@ -139,5 +139,5 @@ docker run ${docker_args[@]} \ -v $ROOT:/go/src/github.com/pingcap/tidb-operator \ -w /go/src/github.com/pingcap/tidb-operator \ --entrypoint /usr/local/bin/runner.sh \ - gcr.io/k8s-testimages/kubekins-e2e:v20191108-9467d02-master \ + gcr.io/k8s-testimages/kubekins-e2e:v20200311-1e25827-master \ "${args[@]}" diff --git a/tests/e2e/e2e.go b/tests/e2e/e2e.go index 0c7f927d77..a7058225aa 100644 --- a/tests/e2e/e2e.go +++ b/tests/e2e/e2e.go @@ -122,12 +122,12 @@ func setupSuite() { e2elog.Logf("WARNING: Waiting for all daemonsets to be ready failed: %v", err) } - // By using default storage class in GKE/EKS, network attached storage + // By using default storage class in GKE/EKS (aws), network attached storage // which be used and we must clean them later. // We set local-storage class as default for simplicity. // The default storage class of kind is local-path-provisioner which // consumes local storage like local-volume-provisioner. 
- if framework.TestContext.Provider == "gke" || framework.TestContext.Provider == "eks" { + if framework.TestContext.Provider == "gke" || framework.TestContext.Provider == "aws" { defaultSCName := "local-storage" list, err := c.StorageV1().StorageClasses().List(metav1.ListOptions{}) framework.ExpectNoError(err)