Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci(engine): add chaos-mesh test cases in github action, part-1 #396

Merged
merged 13 commits into from
May 18, 2022
227 changes: 227 additions & 0 deletions .github/workflows/dataflow_engine_chaos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
# Chaos-testing workflow for the dataflow engine: it brings up a K3s cluster on
# the runner, deploys the metastore, server-master and executor into it, and
# then runs the chaos test cases against that deployment.
name: Dataflow Engine Chaos

# Triggers: the hourly schedule inside the cron window below, plus every pull
# request that targets master.
on:
schedule:
- cron: '0 17-23 * * *' # run at minute 0 every hour from 01:00 ~ 07:00 UTC+8
pull_request:
branches: [ master ]

# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
# Runs are grouped by ref and workflow name; a newer run in the same group
# cancels the one that is still in progress.
concurrency:
group: ${{ github.ref }}-${{ github.workflow }}
cancel-in-progress: true

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
# This workflow contains a single job called "base"
base:
# The type of runner that the job will run on
runs-on: ubuntu-18.04
# Upper bound for the whole job, including cluster setup and the case run.
timeout-minutes: 30
strategy:
# Let the remaining matrix entries keep running when one of them fails.
fail-fast: false
# One job per chaos scenario. The value names the uploaded log artifact; the
# chaos-mesh steps that would also consume it are currently commented out.
matrix:
chaos-obj:
[
"pod-failure-dfe",
"pod-kill-dfe",
]

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Check out the repository so the Makefile and the chaos/ manifests are available.
- uses: actions/checkout@v2

# Install the Go toolchain used by the build step.
- uses: actions/setup-go@v3
with:
go-version: 1.18

# Reuse downloaded modules between runs; the cache key changes whenever go.sum does.
- name: Cache go modules
uses: actions/cache@v2
with:
path: ~/go/pkg/mod
key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}

# Set up Kubernetes with K3s
# The installer is pinned to a fixed K3s release. `${k3s_disable_command:---disable}`
# is a shell default expansion: it yields `--disable` unless k3s_disable_command is
# set to a non-empty value, so metrics-server and traefik are switched off.
# `--docker` makes K3s use the runner's docker daemon as its container runtime.
- name: Set up K3s cluster
run: |
curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.18.9+k3s1 sh -s - \
--write-kubeconfig-mode=644 \
"${k3s_disable_command:---disable}" metrics-server \
"${k3s_disable_command:---disable}" traefik \
--flannel-backend=none \
--docker
shell: bash

# This step may fail occasionally. Ideally the workflow would exit right away when it does,
# but GitHub Actions doesn't support early exit yet, see https://github.com/actions/runner/issues/662.
# So we simply wait with a long timeout instead.
# KUBECONFIG is set on this step directly because the job-wide export only happens in the next step.
- name: Wait for coredns
run: |
kubectl rollout status --watch --timeout 600s deployment/coredns -n kube-system
shell: bash
env:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml

# Persist KUBECONFIG through $GITHUB_ENV so that kubectl in every later step uses the K3s kubeconfig.
- name: Export KUBECONFIG environment variable
run: |
echo 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml' >> $GITHUB_ENV
shell: bash

# Diagnostics only: dump the cluster state to the job log to help debug failed runs.
- name: Print cluster information
run: |
kubectl config view
kubectl cluster-info
kubectl get nodes
kubectl get pods -n kube-system
kubectl get sc
kubectl version

# Build the master, executor and chaos-case binaries through the Makefile targets.
- name: Build dataflow engine binary
run: make df-master df-executor df-chaos-case

# Copy the chaos config directory into bin/, then build the dataflow:chaos image with
# bin/ as the docker build context. Presumably the cluster picks this local image up
# because K3s was installed with --docker -- confirm against the manifests.
- name: Build Dataflow engine docker image
run: |
cp -r $GITHUB_WORKSPACE/chaos/manifests/conf/ $GITHUB_WORKSPACE/bin/
docker build -f $GITHUB_WORKSPACE/chaos/manifests/Dockerfile -t dataflow:chaos $GITHUB_WORKSPACE/bin
docker image list

# Set up metastore and basic services
# Apply the manifest, then print what was created for the job log.
- name: Set up metastore and basic services
run: |
kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/metastore.yaml
kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/metastore.yaml
kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/metastore.yaml
# The first two waits tolerate a timeout (`|| true`) so that the diagnostics in the
# middle are always printed. The final two waits use --timeout=0s and carry no
# `|| true`, so they are the ones that fail the step when a pod is still not Ready.
- name: Wait for metastore ready
run: |
kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=60s || true
kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=60s || true

echo show pvc
kubectl get pvc -l app=metastore -o wide
echo show pv
kubectl get pv -o wide
echo show svc
kubectl get svc -l app=metastore -o wide
echo show sts
kubectl get sts -l app=metastore -o wide
echo show po
kubectl get po -l app=metastore -o wide
echo describe po
kubectl describe po -l app=metastore
echo describe pvc
kubectl describe pvc -l app=metastore
kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=0s
kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=0s

# Apply the server-master manifest, then print what was created for the job log.
- name: Set up server-master
run: |
kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/server-master.yaml
kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/server-master.yaml
kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/server-master.yaml

# Best-effort wait: a timeout is ignored (`|| true`) and, unlike the metastore step,
# there is no final strict wait, so an unready server-master does not fail this step.
# Everything after the wait is diagnostics: resource listings, then the current and
# previous (`-p`) container logs of server-master-0..2, each tolerated if missing.
- name: Wait for server-master ready
run: |
kubectl wait --for=condition=Ready pod -l app=server-master --all --timeout=60s|| true
echo "<<<<< show pvc >>>>>"
kubectl get pvc -l app=server-master -o wide
echo "<<<<< show pv >>>>>"
kubectl get pv -o wide
echo "<<<<< show svc >>>>>"
kubectl get svc -l app=server-master -o wide
echo "<<<<< show sts >>>>>"
kubectl get sts -l app=server-master -o wide
echo "<<<<< show po >>>>>"
kubectl get po -l app=server-master -o wide
echo "<<<<< describe po >>>>>"
kubectl describe po -l app=server-master
echo "<<<<< describe pvc >>>>>"
kubectl describe pvc -l app=server-master
echo "<<<<< show current log for server-master-0 >>>>>"
kubectl logs server-master-0 || true
echo "<<<<< show previous log for server-master-0 >>>>>"
kubectl logs server-master-0 -p || true
echo "<<<<< show current log for server-master-1 >>>>>"
kubectl logs server-master-1 || true
echo "<<<<< show previous log for server-master-1 >>>>>"
kubectl logs server-master-1 -p || true
echo "<<<<< show current log for server-master-2 >>>>>"
kubectl logs server-master-2 || true
echo "<<<<< show previous log for server-master-2 >>>>>"
kubectl logs server-master-2 -p || true

# Apply the executor manifest, then print what was created for the job log.
- name: Set up executor
run: |
kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/executor.yaml
kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/executor.yaml
kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/executor.yaml

# Best-effort wait for the executor pods: a timeout is ignored (`|| true`) so the
# diagnostics below are always printed. The step then lists the executor
# resources and dumps the current and previous (`-p`) container logs of
# executor-0..2; a missing pod or missing previous log is tolerated.
- name: Wait for executor ready
  run: |
    kubectl wait --for=condition=Ready pod -l app=executor --all --timeout=60s || true
    echo "<<<<< show pvc >>>>>"
    kubectl get pvc -l app=executor -o wide
    echo "<<<<< show pv >>>>>"
    kubectl get pv -o wide
    echo "<<<<< show svc >>>>>"
    kubectl get svc -l app=executor -o wide
    echo "<<<<< show sts >>>>>"
    kubectl get sts -l app=executor -o wide
    echo "<<<<< show po >>>>>"
    kubectl get po -l app=executor -o wide
    echo "<<<<< describe po >>>>>"
    kubectl describe po -l app=executor
    echo "<<<<< describe pvc >>>>>"
    kubectl describe pvc -l app=executor
    # Derive the banner from the pod name so the label always matches the pod
    # whose log follows it.
    for pod in executor-0 executor-1 executor-2; do
      echo "<<<<< show current log for ${pod} >>>>>"
      kubectl logs "${pod}" || true
      echo "<<<<< show previous log for ${pod} >>>>>"
      kubectl logs "${pod}" -p || true
    done

# Deploy the chaos test cases into the cluster and print what was created.
- name: Set up chaos test cases
run: |
kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/cases.yaml
kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/cases.yaml
kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/cases.yaml

# NOTE: fault injection is currently disabled. The two commented-out steps below would
# base64-encode the chaos-mesh manifest of the current matrix entry and hand it to
# chaos-mesh-action.
# - name: Encode chaos-mesh action
# run: |
# echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV

# - name: Run chaos mesh action
# uses: chaos-mesh/chaos-mesh-action@master
# env:
# CFG_BASE64: ${{ env.CFG_BASE64 }}
# CHAOS_MESH_VERSION: v1.0.0

# Wait for the chaos test case to complete. Per the original note, check-case.sh
# checks once a minute, up to 20 times (the script itself is not shown here).
- name: Wait for chaos test case complete
run: |
$GITHUB_WORKSPACE/chaos/scripts/check-case.sh

# Collect logs into ./logs inside the workspace and hand ownership to the
# runner user so that the upload step can read them. Runs even when earlier
# steps failed.
- name: Copy logs to hack permission
  if: ${{ always() }}
  run: |
    # -p: do not fail if the directory already exists.
    mkdir -p ./logs
    # -L dereferences symlinks so the real log files are copied.
    sudo cp -r -L /var/log/containers/. ./logs
    # GNU find defaults to Emacs-style regexes, where a bare ( | ) is literal
    # text, so the alternation needs -regextype posix-extended to match
    # anything. The dot before "log" is escaped to match a literal ".".
    # Still best effort: a failure to find or copy these files is ignored.
    sudo find /var/ -type f -regextype posix-extended -regex '.*/(server-master|executor)\.log$' | sudo xargs -I {} cp {} ./logs || true
    sudo chown -R runner ./logs

# Uploading logs as an artifact does not seem to be stable, so we set `continue-on-error: true` here.
# The step runs even when earlier steps failed (`always()`), produces one artifact per
# matrix entry, and excludes the coredns and local-path-provisioner logs.
- name: Upload logs
continue-on-error: true
uses: actions/upload-artifact@v2
if: ${{ always() }}
with:
name: chaos-base-logs.${{ matrix.chaos-obj }}
path: |
./logs
!./logs/coredns-*
!./logs/local-path-provisioner-*
12 changes: 8 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
TEST_DIR := /tmp/dataflow_engine_test
PARALLEL=3
GO := GO111MODULE=on go
GOBUILD := CGO_ENABLED=0 $(GO) build -trimpath
GOTEST := CGO_ENABLED=1 go test -p $(PARALLEL) --race
FAIL_ON_STDOUT := awk '{ print } END { if (NR > 0) { exit 1 } }'

Expand All @@ -21,20 +22,23 @@ df-proto:
./generate-proto.sh

df-master:
go build -o bin/master ./cmd/master
$(GOBUILD) -o bin/master ./cmd/master
cp ./bin/master ./ansible/roles/common/files/master.bin

df-executor:
go build -o bin/executor ./cmd/executor
$(GOBUILD) -o bin/executor ./cmd/executor
cp ./bin/executor ./ansible/roles/common/files/executor.bin

df-master-client:
go build -o bin/master-client ./cmd/master-client
$(GOBUILD) -o bin/master-client ./cmd/master-client

df-demo:
go build -o bin/demoserver ./cmd/demoserver
$(GOBUILD) -o bin/demoserver ./cmd/demoserver
cp ./bin/demoserver ./ansible/roles/common/files/demoserver.bin

# Build the chaos test-case binary from ./chaos/cases into bin/df-chaos-case.
df-chaos-case:
$(GOBUILD) -o bin/df-chaos-case ./chaos/cases

unit_test: check_failpoint_ctl
mkdir -p "$(TEST_DIR)"
$(FAILPOINT_ENABLE)
Expand Down
30 changes: 30 additions & 0 deletions chaos/cases/cases.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"context"

"github.com/pingcap/tiflow/dm/pkg/log"
"go.uber.org/zap"
)

// cases lists the names of the chaos test cases that runCases iterates over.
var cases = []string{"fake-job-normal", "fake-job-fast-finish"}

// runCases walks the registered case names in order and emits one info-level
// log entry per case. It performs no other work at the moment: ctx is not
// consulted and the returned error is always nil.
func runCases(ctx context.Context) error {
	for idx := range cases {
		name := cases[idx]
		log.L().Info("run case successfully", zap.String("case", name))
	}
	return nil
}
50 changes: 50 additions & 0 deletions chaos/cases/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"flag"
"time"
)

// config carries the command-line settings of the chaos test program.
// The embedded FlagSet is what fills in the other fields; its "-" tags keep
// it out of the TOML/YAML/JSON representations.
type config struct {
	*flag.FlagSet `toml:"-" yaml:"-" json:"-"`

	// MasterAddr is the address of server-master.
	MasterAddr string `toml:"master-addr" yaml:"master-addr" json:"master-addr"`
	// Duration is how long the cases run.
	Duration time.Duration `toml:"duration" yaml:"duration" json:"duration"`

	// MasterCount is the expected number of server-master instances.
	MasterCount int `toml:"master-count" yaml:"master-count" json:"master-count"`
	// WorkerCount is the expected number of executor instances.
	WorkerCount int `toml:"worker-count" yaml:"worker-count" json:"worker-count"`
}

// newConfig returns a config whose flags are registered with their default
// values but not yet parsed; call parse to apply command-line arguments.
// The flag set uses flag.ContinueOnError, so a bad argument is reported as an
// error from parse.
func newConfig() *config {
	flags := flag.NewFlagSet("chaos-case", flag.ContinueOnError)
	c := &config{FlagSet: flags}

	flags.StringVar(&c.MasterAddr, "master-addr", "server-master:10240", "address of server-master")
	flags.DurationVar(&c.Duration, "duration", 20*time.Minute, "duration of cases running")

	flags.IntVar(&c.MasterCount, "master-count", 3, "expect count of server-master")
	flags.IntVar(&c.WorkerCount, "worker-count", 4, "expect count of executor")

	return c
}

// parse applies args (the argument list without the program name) to the
// registered flags and returns whatever error the flag set reports.
func (c *config) parse(args []string) error {
	// Parse is promoted from the embedded *flag.FlagSet.
	return c.Parse(args)
}
Loading