diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml
new file mode 100644
index 00000000..b19c03b3
--- /dev/null
+++ b/.github/workflows/dataflow_engine_chaos.yaml
@@ -0,0 +1,228 @@
+name: Dataflow Engine Chaos
+
+on:
+  schedule:
+    - cron: '0 17-23 * * *' # run at minute 0 every hour from 01:00 ~ 07:00 UTC+8
+  pull_request:
+    branches: [ master ]
+
+# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
+concurrency:
+  group: ${{ github.ref }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "base"
+  base:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-18.04
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        chaos-obj:
+          [
+            "pod-failure-dfe",
+            "pod-kill-dfe",
+          ]
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: actions/setup-go@v3
+        with:
+          go-version: '1.18'
+
+      - name: Cache go modules
+        uses: actions/cache@v2
+        with:
+          path: ~/go/pkg/mod
+          key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}
+
+      # Set up Kubernetes with K3s
+      - name: Set up K3s cluster
+        run: |
+          curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.18.9+k3s1 sh -s - \
+            --write-kubeconfig-mode=644 \
+            "${k3s_disable_command:---disable}" metrics-server \
+            "${k3s_disable_command:---disable}" traefik \
+            --flannel-backend=none \
+            --docker
+        shell: bash
+
+      # this may fail sometimes, and we would like to exit the workflow directly when it does,
+      # but GitHub Actions doesn't support early-exit yet, see https://github.com/actions/runner/issues/662.
+      # so, simply wait for a long time.
+      - name: Wait for coredns
+        run: |
+          kubectl rollout status --watch --timeout 600s deployment/coredns -n kube-system
+        shell: bash
+        env:
+          KUBECONFIG: /etc/rancher/k3s/k3s.yaml
+
+      - name: Export KUBECONFIG environment variable
+        run: |
+          echo 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml' >> $GITHUB_ENV
+        shell: bash
+
+      - name: Print cluster information
+        run: |
+          kubectl config view
+          kubectl cluster-info
+          kubectl get nodes
+          kubectl get pods -n kube-system
+          kubectl get sc
+          kubectl version
+
+      - name: Build dataflow engine binary
+        run: make df-master df-executor df-chaos-case
+
+      - name: Build Dataflow engine docker image
+        run: |
+          cp -r $GITHUB_WORKSPACE/chaos/manifests/conf/ $GITHUB_WORKSPACE/bin/
+          docker build -f $GITHUB_WORKSPACE/chaos/manifests/Dockerfile -t dataflow:chaos $GITHUB_WORKSPACE/bin
+          docker image list
+
+      # Set up metastore and basic services
+      - name: Set up metastore and basic services
+        run: |
+          kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/metastore.yaml
+          kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/metastore.yaml
+          kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/metastore.yaml
+      - name: Wait for metastore ready
+        run: |
+          kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=60s || true
+          kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=60s || true
+
+          echo show pvc
+          kubectl get pvc -l app=metastore -o wide
+          echo show pv
+          kubectl get pv -o wide
+          echo show svc
+          kubectl get svc -l app=metastore -o wide
+          echo show sts
+          kubectl get sts -l app=metastore -o wide
+          echo show po
+          kubectl get po -l app=metastore -o wide
+          echo describe po
+          kubectl describe po -l app=metastore
+          echo describe pvc
+          kubectl describe pvc -l app=metastore
+          kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=0s
+          kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=0s
+
+      - name: Set up server-master
+        run: |
+          kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/server-master.yaml
+          kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/server-master.yaml
+          kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/server-master.yaml
+
+      - name: Wait for server-master ready
+        run: |
+          kubectl wait --for=condition=Ready pod -l app=server-master --all --timeout=60s|| true
+          echo "<<<<< show pvc >>>>>"
+          kubectl get pvc -l app=server-master -o wide
+          echo "<<<<< show pv >>>>>"
+          kubectl get pv -o wide
+          echo "<<<<< show svc >>>>>"
+          kubectl get svc -l app=server-master -o wide
+          echo "<<<<< show sts >>>>>"
+          kubectl get sts -l app=server-master -o wide
+          echo "<<<<< show po >>>>>"
+          kubectl get po -l app=server-master -o wide
+          echo "<<<<< describe po >>>>>"
+          kubectl describe po -l app=server-master
+          echo "<<<<< describe pvc >>>>>"
+          kubectl describe pvc -l app=server-master
+          echo "<<<<< show current log for server-master-0 >>>>>"
+          kubectl logs server-master-0 || true
+          echo "<<<<< show previous log for server-master-0 >>>>>"
+          kubectl logs server-master-0 -p || true
+          echo "<<<<< show current log for server-master-1 >>>>>"
+          kubectl logs server-master-1 || true
+          echo "<<<<< show previous log for server-master-1 >>>>>"
+          kubectl logs server-master-1 -p || true
+          echo "<<<<< show current log for server-master-2 >>>>>"
+          kubectl logs server-master-2 || true
+          echo "<<<<< show previous log for server-master-2 >>>>>"
+          kubectl logs server-master-2 -p || true
+
+      - name: Set up executor
+        run: |
+          kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/executor.yaml
+          kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/executor.yaml
+          kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/executor.yaml
+
+      - name: Wait for executor ready
+        run: |
+          kubectl wait --for=condition=Ready pod -l app=executor --all --timeout=60s|| true
+          echo "<<<<< show pvc >>>>>"
+          kubectl get pvc -l app=executor -o wide
+          echo "<<<<< show pv >>>>>"
+          kubectl get pv -o wide
+          echo "<<<<< show svc >>>>>"
+          kubectl get svc -l app=executor -o wide
+          echo "<<<<< show sts >>>>>"
+          kubectl get sts -l app=executor -o wide
+          echo "<<<<< show po >>>>>"
+          kubectl get po -l app=executor -o wide
+          echo "<<<<< describe po >>>>>"
+          kubectl describe po -l app=executor
+          echo "<<<<< describe pvc >>>>>"
+          kubectl describe pvc -l app=executor
+          echo "<<<<< show current log for executor-0 >>>>>"
+          kubectl logs executor-0 || true
+          echo "<<<<< show previous log for executor-0 >>>>>"
+          kubectl logs executor-0 -p || true
+          echo "<<<<< show current log for executor-1 >>>>>"
+          kubectl logs executor-1 || true
+          echo "<<<<< show previous log for executor-1 >>>>>"
+          kubectl logs executor-1 -p || true
+          echo "<<<<< show current log for executor-2 >>>>>"
+          kubectl logs executor-2 || true
+          echo "<<<<< show previous log for executor-2 >>>>>"
+          kubectl logs executor-2 -p || true
+
+      - name: Set up chaos test cases
+        run: |
+          kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/cases.yaml
+          kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/cases.yaml
+          kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/cases.yaml
+
+      # - name: Encode chaos-mesh action
+      #   run: |
+      #     echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV
+
+      # - name: Run chaos mesh action
+      #   uses: chaos-mesh/chaos-mesh-action@master
+      #   env:
+      #     CFG_BASE64: ${{ env.CFG_BASE64 }}
+      #     CHAOS_MESH_VERSION: v1.0.0
+
+      # check whether complete with 1m * 20 times.
+      - name: Wait for chaos test case complete
+        run: |
+          $GITHUB_WORKSPACE/chaos/scripts/check-case.sh
+
+      - name: Copy logs to hack permission
+        if: ${{ always() }}
+        run: |
+          mkdir -p ./logs
+          sudo cp -r -L /var/log/containers/. ./logs
+          # -regextype posix-extended is required: find's default regex syntax treats bare ( | ) as literals.
+          sudo find /var/ -type f -regextype posix-extended -regex '.*/(server-master|executor)\.log$' | sudo xargs -I {} cp {} ./logs || true
+          sudo chown -R runner ./logs
+
+      # Upload logs as artifact seems not stable, so we set `continue-on-error: true` here.
+      - name: Upload logs
+        continue-on-error: true
+        uses: actions/upload-artifact@v2
+        if: ${{ always() }}
+        with:
+          name: chaos-base-logs.${{ matrix.chaos-obj }}
+          path: |
+            ./logs
+            !./logs/coredns-*
+            !./logs/local-path-provisioner-*
diff --git a/Makefile b/Makefile
index 66c4df40..ae510ba8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
 TEST_DIR := /tmp/dataflow_engine_test
 PARALLEL=3
 GO := GO111MODULE=on go
+GOBUILD := CGO_ENABLED=0 $(GO) build -trimpath
 GOTEST := CGO_ENABLED=1 go test -p $(PARALLEL) --race
 FAIL_ON_STDOUT := awk '{ print } END { if (NR > 0) { exit 1 } }'
 
@@ -21,20 +22,23 @@ df-proto:
 	./generate-proto.sh
 
 df-master:
-	go build -o bin/master ./cmd/master
+	$(GOBUILD) -o bin/master ./cmd/master
 	cp ./bin/master ./ansible/roles/common/files/master.bin
 
 df-executor:
-	go build -o bin/executor ./cmd/executor
+	$(GOBUILD) -o bin/executor ./cmd/executor
 	cp ./bin/executor ./ansible/roles/common/files/executor.bin
 
 df-master-client:
-	go build -o bin/master-client ./cmd/master-client
+	$(GOBUILD) -o bin/master-client ./cmd/master-client
 
 df-demo:
-	go build -o bin/demoserver ./cmd/demoserver
+	$(GOBUILD) -o bin/demoserver ./cmd/demoserver
 	cp ./bin/demoserver ./ansible/roles/common/files/demoserver.bin
 
+df-chaos-case:
+	$(GOBUILD) -o bin/df-chaos-case ./chaos/cases
+
 unit_test: check_failpoint_ctl
 	mkdir -p "$(TEST_DIR)"
 	$(FAILPOINT_ENABLE)
diff --git a/chaos/cases/cases.go b/chaos/cases/cases.go
new file mode 100644
index 00000000..b9ec39af
--- /dev/null
+++ b/chaos/cases/cases.go
@@ -0,0 +1,33 @@
+// Copyright 2022 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"context"
+
+	"github.com/pingcap/tiflow/dm/pkg/log"
+	"go.uber.org/zap"
+)
+
+// cases lists the names of the chaos cases that runCases walks through.
+var cases = []string{"fake-job-normal", "fake-job-fast-finish"}
+
+// runCases iterates over cases. For now it only logs each case name: ctx is not
+// consulted and the returned error is always nil.
+func runCases(ctx context.Context) error {
+	for _, c := range cases {
+		log.L().Info("run case successfully", zap.String("case", c))
+	}
+	return nil
+}
diff --git a/chaos/cases/config.go b/chaos/cases/config.go
new file mode 100644
index 00000000..3e80d99f
--- /dev/null
+++ b/chaos/cases/config.go
@@ -0,0 +1,50 @@
+// Copyright 2020 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"flag"
+	"time"
+)
+
+// config is used to run chaos tests.
+type config struct {
+	*flag.FlagSet `toml:"-" yaml:"-" json:"-"`
+
+	MasterAddr string        `toml:"master-addr" yaml:"master-addr" json:"master-addr"`
+	Duration   time.Duration `toml:"duration" yaml:"duration" json:"duration"`
+
+	MasterCount int `toml:"master-count" yaml:"master-count" json:"master-count"`
+	WorkerCount int `toml:"worker-count" yaml:"worker-count" json:"worker-count"`
+}
+
+// newConfig creates a config for this chaos testing suite.
+func newConfig() *config {
+	cfg := &config{}
+	cfg.FlagSet = flag.NewFlagSet("chaos-case", flag.ContinueOnError)
+	fs := cfg.FlagSet
+
+	fs.StringVar(&cfg.MasterAddr, "master-addr", "server-master:10240", "address of server-master")
+	fs.DurationVar(&cfg.Duration, "duration", 20*time.Minute, "duration of cases running")
+
+	fs.IntVar(&cfg.MasterCount, "master-count", 3, "expect count of server-master")
+	fs.IntVar(&cfg.WorkerCount, "worker-count", 4, "expect count of executor")
+
+	return cfg
+}
+
+// parse parses flag definitions from the argument list.
+func (c *config) parse(args []string) error {
+	return c.FlagSet.Parse(args)
+}
diff --git a/chaos/cases/main.go b/chaos/cases/main.go
new file mode 100644
index 00000000..ab15da6d
--- /dev/null
+++ b/chaos/cases/main.go
@@ -0,0 +1,92 @@
+// Copyright 2022 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"math/rand"
+	"net/http"
+	_ "net/http/pprof" // registers the /debug/pprof handlers on http.DefaultServeMux
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+
+	"github.com/pingcap/errors"
+	"github.com/pingcap/tiflow/dm/pkg/log"
+	"go.uber.org/zap"
+)
+
+// main runs the chaos test cases after the metastore, server-master and executor have been set up.
+// NOTE: run this in the same K8s namespace as server-master.
+func main() {
+	code := 0
+	defer func() {
+		os.Exit(code)
+	}()
+
+	cfg := newConfig()
+	err := cfg.parse(os.Args[1:])
+	switch errors.Cause(err) {
+	case nil:
+	case flag.ErrHelp:
+		return
+	default:
+		fmt.Println("parse cmd flags err:", err.Error())
+		code = 2
+		return
+	}
+
+	err = log.InitLogger(&log.Config{
+		File:  "chaos-case.log",
+		Level: "info",
+	})
+	if err != nil {
+		fmt.Println("init logger error:", err.Error())
+		code = 2
+		return
+	}
+
+	go func() {
+		//nolint:errcheck
+		http.ListenAndServe("0.0.0.0:8899", nil) // for pprof
+	}()
+
+	rand.Seed(time.Now().UnixNano())
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	sc := make(chan os.Signal, 1)
+	signal.Notify(sc,
+		syscall.SIGHUP,
+		syscall.SIGINT,
+		syscall.SIGTERM,
+		syscall.SIGQUIT)
+	go func() {
+		sig := <-sc
+		log.L().Info("got signal to exit", zap.Stringer("signal", sig))
+		cancel()
+	}()
+
+	// run tests cases
+	err = runCases(ctx)
+	if err != nil {
+		log.L().Error("run cases failed", zap.Error(err))
+		code = 2
+		return
+	}
+}
diff --git a/chaos/manifests/Dockerfile b/chaos/manifests/Dockerfile
new file mode 100644
index 00000000..7a78ed7e
--- /dev/null
+++ b/chaos/manifests/Dockerfile
@@ -0,0 +1,12 @@
+FROM alpine:3.14
+
+ADD master /df-server-master
+ADD executor /df-executor
+ADD df-chaos-case /df-chaos-case
+ADD conf /conf
+
+RUN chmod a+x /df-server-master /df-executor /df-chaos-case
+
+WORKDIR /
+
+EXPOSE 10239 10240 10241
diff --git a/chaos/manifests/cases.yaml b/chaos/manifests/cases.yaml
new file mode 100644
index 00000000..21316c5c
--- /dev/null
+++ b/chaos/manifests/cases.yaml
@@ -0,0 +1,16 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: chaos-test-case
+spec:
+  template:
+    spec:
+      containers:
+        - name: chaos-test-case
+          image: dataflow:chaos
+          imagePullPolicy: IfNotPresent
+          command:
+            - "/df-chaos-case"
+            - "--duration=20m"
+      restartPolicy: Never
+  backoffLimit: 0 # fail immediately
diff --git a/chaos/manifests/conf/executor.toml b/chaos/manifests/conf/executor.toml
new file mode 100644
index 00000000..395201d8
--- /dev/null
+++ b/chaos/manifests/conf/executor.toml
@@ -0,0 +1,3 @@
+keepalive-ttl = "20s"
+keepalive-interval = "500ms"
+session-ttl = 20
diff --git a/chaos/manifests/conf/server-master.toml b/chaos/manifests/conf/server-master.toml
new file mode 100644
index 00000000..6981dd49
--- /dev/null
+++ b/chaos/manifests/conf/server-master.toml
@@ -0,0 +1,12 @@
+[etcd]
+data-dir = "/data/etcd"
+
+[frame-metastore-conf]
+endpoints = ["metastore-framework-mysql-0.metastore:3306"]
+auth.user = "root"
+auth.passwd = ""
+
+[user-metastore-conf]
+# A per-pod DNS name resolves to the pod itself, so the Service's 12479 -> 2479 port
+# mapping does not apply here; use the port etcd listens on (see metastore.yaml).
+endpoints = ["metastore-user-etcd-0.metastore:2479"]
diff --git a/chaos/manifests/executor.yaml b/chaos/manifests/executor.yaml
new file mode 100644
index 00000000..c444cdf6
--- /dev/null
+++ b/chaos/manifests/executor.yaml
@@ -0,0 +1,75 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: executor
+  labels:
+    app: executor
+spec:
+  ports:
+    - name: executor
+      port: 10241
+      targetPort: 10241
+  selector:
+    app: executor
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: executor
+  labels:
+    app: executor
+spec:
+  selector:
+    matchLabels:
+      app: executor
+  serviceName: executor
+  replicas: 4
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: executor
+    spec:
+      containers:
+        - name: executor
+          image: dataflow:chaos # build this image in GitHub action workflow
+          imagePullPolicy: IfNotPresent
+          volumeMounts:
+            - mountPath: /log
+              name: executor-log
+          env:
+            - name: MY_POD_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: MY_POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          ports:
+            - containerPort: 10241
+              name: executor
+          command:
+            - "/df-executor"
+            - "--name=$(MY_POD_NAME)"
+            - "--worker-addr=0.0.0.0:10241"
+            - "--advertise-addr=$(MY_POD_NAME).executor.$(MY_POD_NAMESPACE):10241"
+            - "--join=server-master-0.server-master.$(MY_POD_NAMESPACE):10240,server-master-1.server-master.$(MY_POD_NAMESPACE):10240,server-master-2.server-master.$(MY_POD_NAMESPACE):10240"
+            - "--config=/conf/executor.toml"
+          readinessProbe:
+            httpGet:
+              port: 10241
+              path: /metrics # TODO: use a real probe url
+              scheme: HTTP
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            failureThreshold: 5
+  volumeClaimTemplates:
+    - metadata:
+        name: executor-log
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 1Gi
diff --git a/chaos/manifests/metastore.yaml b/chaos/manifests/metastore.yaml
new file mode 100644
index 00000000..0306a1c1
--- /dev/null
+++ b/chaos/manifests/metastore.yaml
@@ -0,0 +1,108 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: metastore
+  labels:
+    app: metastore
+spec:
+  ports:
+    - name: port-mysql # note the name is no more than 15 characters
+      port: 3306
+      targetPort: 3306
+    - name: port-etcd # note the name is no more than 15 characters
+      port: 12479
+      targetPort: 2479
+  selector:
+    app: metastore
+
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: metastore-framework-mysql
+  labels:
+    app: metastore
+spec:
+  selector:
+    matchLabels:
+      app: metastore
+  serviceName: metastore
+  replicas: 1
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: metastore
+    spec:
+      containers:
+        - name: metastore-framework-mysql
+          image: mysql:5.7
+          imagePullPolicy: IfNotPresent
+          volumeMounts:
+            - mountPath: "/var/lib/mysql"
+              name: metastore-framework-mysql
+          env:
+            - name: MYSQL_ALLOW_EMPTY_PASSWORD
+              value: "true"
+          ports:
+            - containerPort: 3306
+              name: port-mysql
+          args:
+            - "--server-id=1"
+  volumeClaimTemplates:
+    - metadata:
+        name: metastore-framework-mysql
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 5Gi
+
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: metastore-user-etcd
+  labels:
+    app: metastore
+spec:
+  selector:
+    matchLabels:
+      app: metastore
+  serviceName: metastore
+  replicas: 1
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: metastore
+    spec:
+      containers:
+        - name: metastore-user-etcd
+          image: quay.io/coreos/etcd:v3.5.4
+          imagePullPolicy: IfNotPresent
+          ports:
+            - containerPort: 2479
+              name: port-etcd
+          command:
+            - "etcd"
+          args:
+            - "--name=metastore-user-etcd"
+            - "--advertise-client-urls=http://0.0.0.0:2479"
+            - "--listen-client-urls=http://0.0.0.0:2479"
+            - "--listen-peer-urls=http://127.0.0.1:2480"
+            - "--initial-advertise-peer-urls=http://127.0.0.1:2480"
+            - "--initial-cluster=metastore-user-etcd=http://127.0.0.1:2480"
+            - "--initial-cluster-state=new"
+  # NOTE(review): the container above has no volumeMounts entry for this claim, so it is
+  # created but never mounted - confirm whether a mount (plus --data-dir) was intended.
+  volumeClaimTemplates:
+    - metadata:
+        name: metastore-user-etcd
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 5Gi
diff --git a/chaos/manifests/server-master.yaml b/chaos/manifests/server-master.yaml
new file mode 100644
index 00000000..fb7e5d3d
--- /dev/null
+++ b/chaos/manifests/server-master.yaml
@@ -0,0 +1,84 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: server-master
+  labels:
+    app: server-master
+spec:
+  ports:
+    - name: port-master
+      port: 10240
+      targetPort: 10240
+    - name: port-mpeer
+      port: 10239
+      targetPort: 10239
+  selector:
+    app: server-master
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: server-master
+  labels:
+    app: server-master
+spec:
+  selector:
+    matchLabels:
+      app: server-master
+  serviceName: server-master
+  replicas: 3
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: server-master
+    spec:
+      containers:
+        - name: server-master
+          image: dataflow:chaos
+          imagePullPolicy: IfNotPresent
+          volumeMounts:
+            - mountPath: /data
+              name: server-master-data
+            - mountPath: /log
+              name: server-master-log
+          env:
+            - name: MY_POD_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: MY_POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          ports:
+            - containerPort: 10240
+              name: port-master
+            - containerPort: 10239
+              name: port-mpeer
+          command:
+            - "/df-server-master"
+            - "--name=$(MY_POD_NAME)"
+            - "--master-addr=0.0.0.0:10240"
+            - "--advertise-addr=$(MY_POD_NAME).server-master.$(MY_POD_NAMESPACE):10240"
+            - "--peer-urls=0.0.0.0:10239"
+            - "--advertise-peer-urls=http://$(MY_POD_NAME).server-master.$(MY_POD_NAMESPACE):10239"
+            - "--initial-cluster=server-master-0=http://server-master-0.server-master.$(MY_POD_NAMESPACE):10239,server-master-1=http://server-master-1.server-master.$(MY_POD_NAMESPACE):10239,server-master-2=http://server-master-2.server-master.$(MY_POD_NAMESPACE):10239"
+            - "--config=/conf/server-master.toml"
+  volumeClaimTemplates:
+    - metadata:
+        name: server-master-data
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 2Gi
+    - metadata:
+        name: server-master-log
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 1Gi
diff --git a/chaos/scripts/check-case.sh b/chaos/scripts/check-case.sh
new file mode 100755
index 00000000..d7cf7395
--- /dev/null
+++ b/chaos/scripts/check-case.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+completed=false
+for i in {1..20}; do
+	kubectl wait --for=condition=complete job/chaos-test-case --timeout=1m
+	if [ $? -eq 0 ]; then
+		completed=true
+		echo "chaos-test-case has completed"
+		break
+	else
+		echo "chaos-test-case has not completed" ${i}
+		kubectl get job chaos-test-case -o wide
+		if [ $? -ne 0 ]; then
+			echo "chaos-test-case job has been cleared"
+			break
+		fi
+		failed=$(kubectl get job chaos-test-case -o jsonpath='{.status.failed}')
+		if [[ ${failed:-0} -gt 0 ]]; then
+			echo "chaos-test-case job has failed"
+			kubectl logs job.batch/chaos-test-case
+			kubectl describe jobs/chaos-test-case
+			break
+		fi
+	fi
+done
+
+if ! $completed; then
+	exit 1
+fi