Skip to content

Commit

Permalink
ci(engine): add chaos-mesh test cases in github action, part-1 (#396)
Browse files Browse the repository at this point in the history
* ci(engine): add chaos-mesh test cases in github action
  • Loading branch information
amyangfei authored May 18, 2022
1 parent 88da691 commit fb4028a
Show file tree
Hide file tree
Showing 13 changed files with 741 additions and 4 deletions.
227 changes: 227 additions & 0 deletions .github/workflows/dataflow_engine_chaos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
name: Dataflow Engine Chaos

on:
schedule:
- cron: '0 17-23 * * *' # run at minute 0 every hour from 01:00 ~ 07:00 UTC+8
pull_request:
branches: [ master ]

# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
concurrency:
group: ${{ github.ref }}-${{ github.workflow }}
cancel-in-progress: true

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
# This workflow contains a single job called "base"
base:
# The type of runner that the job will run on
runs-on: ubuntu-18.04
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
chaos-obj:
[
"pod-failure-dfe",
"pod-kill-dfe",
]

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
- uses: actions/checkout@v2

- uses: actions/setup-go@v3
with:
go-version: 1.18

- name: Cache go modules
uses: actions/cache@v2
with:
path: ~/go/pkg/mod
key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}

# Set up Kubernetes with K3s
- name: Set up K3s cluster
run: |
curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.18.9+k3s1 sh -s - \
--write-kubeconfig-mode=644 \
"${k3s_disable_command:---disable}" metrics-server \
"${k3s_disable_command:---disable}" traefik \
--flannel-backend=none \
--docker
shell: bash

# This step may fail occasionally. Ideally the workflow would exit immediately on failure,
# but GitHub Actions doesn't support early exit yet, see https://github.com/actions/runner/issues/662.
# So we simply wait with a long timeout instead.
- name: Wait for coredns
run: |
kubectl rollout status --watch --timeout 600s deployment/coredns -n kube-system
shell: bash
env:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml

- name: Export KUBECONFIG environment variable
run: |
echo 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml' >> $GITHUB_ENV
shell: bash

- name: Print cluster information
run: |
kubectl config view
kubectl cluster-info
kubectl get nodes
kubectl get pods -n kube-system
kubectl get sc
kubectl version
- name: Build dataflow engine binary
run: make df-master df-executor df-chaos-case

- name: Build Dataflow engine docker image
run: |
cp -r $GITHUB_WORKSPACE/chaos/manifests/conf/ $GITHUB_WORKSPACE/bin/
docker build -f $GITHUB_WORKSPACE/chaos/manifests/Dockerfile -t dataflow:chaos $GITHUB_WORKSPACE/bin
docker image list
# Set up metastore and basic services
- name: Set up metastore and basic services
run: |
kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/metastore.yaml
kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/metastore.yaml
kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/metastore.yaml
- name: Wait for metastore ready
run: |
kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=60s || true
kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=60s || true
echo show pvc
kubectl get pvc -l app=metastore -o wide
echo show pv
kubectl get pv -o wide
echo show svc
kubectl get svc -l app=metastore -o wide
echo show sts
kubectl get sts -l app=metastore -o wide
echo show po
kubectl get po -l app=metastore -o wide
echo describe po
kubectl describe po -l app=metastore
echo describe pvc
kubectl describe pvc -l app=metastore
kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=0s
kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=0s
- name: Set up server-master
run: |
kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/server-master.yaml
kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/server-master.yaml
kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/server-master.yaml
- name: Wait for server-master ready
run: |
kubectl wait --for=condition=Ready pod -l app=server-master --all --timeout=60s|| true
echo "<<<<< show pvc >>>>>"
kubectl get pvc -l app=server-master -o wide
echo "<<<<< show pv >>>>>"
kubectl get pv -o wide
echo "<<<<< show svc >>>>>"
kubectl get svc -l app=server-master -o wide
echo "<<<<< show sts >>>>>"
kubectl get sts -l app=server-master -o wide
echo "<<<<< show po >>>>>"
kubectl get po -l app=server-master -o wide
echo "<<<<< describe po >>>>>"
kubectl describe po -l app=server-master
echo "<<<<< describe pvc >>>>>"
kubectl describe pvc -l app=server-master
echo "<<<<< show current log for server-master-0 >>>>>"
kubectl logs server-master-0 || true
echo "<<<<< show previous log for server-master-0 >>>>>"
kubectl logs server-master-0 -p || true
echo "<<<<< show current log for server-master-1 >>>>>"
kubectl logs server-master-1 || true
echo "<<<<< show previous log for server-master-1 >>>>>"
kubectl logs server-master-1 -p || true
echo "<<<<< show current log for server-master-2 >>>>>"
kubectl logs server-master-2 || true
echo "<<<<< show previous log for server-master-2 >>>>>"
kubectl logs server-master-2 -p || true
- name: Set up executor
run: |
kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/executor.yaml
kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/executor.yaml
kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/executor.yaml
# Wait up to 60s for the executor pods to become Ready (a timeout is tolerated via `|| true`),
# then print PVC/PV/service/StatefulSet/pod state and the current and previous logs of executor-0..2.
- name: Wait for executor ready
run: |
kubectl wait --for=condition=Ready pod -l app=executor --all --timeout=60s|| true
echo "<<<<< show pvc >>>>>"
kubectl get pvc -l app=executor -o wide
echo "<<<<< show pv >>>>>"
kubectl get pv -o wide
echo "<<<<< show svc >>>>>"
kubectl get svc -l app=executor -o wide
echo "<<<<< show sts >>>>>"
kubectl get sts -l app=executor -o wide
echo "<<<<< show po >>>>>"
kubectl get po -l app=executor -o wide
echo "<<<<< describe po >>>>>"
kubectl describe po -l app=executor
echo "<<<<< describe pvc >>>>>"
kubectl describe pvc -l app=executor
echo "<<<<< show current log for executor-0 >>>>>"
kubectl logs executor-0 || true
echo "<<<<< show previous log for executor-0 >>>>>"
kubectl logs executor-0 -p || true
echo "<<<<< show current log for executor-1 >>>>>"
kubectl logs executor-1 || true
echo "<<<<< show previous log for executor-1 >>>>>"
kubectl logs executor-1 -p || true
echo "<<<<< show current log for executor-2 >>>>>"
kubectl logs executor-2 || true
echo "<<<<< show previous log for executor-2 >>>>>"
kubectl logs executor-2 -p || true
- name: Set up chaos test cases
run: |
kubectl apply -f $GITHUB_WORKSPACE/chaos/manifests/cases.yaml
kubectl get -f $GITHUB_WORKSPACE/chaos/manifests/cases.yaml
kubectl describe -f $GITHUB_WORKSPACE/chaos/manifests/cases.yaml
# - name: Encode chaos-mesh action
# run: |
# echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV

# - name: Run chaos mesh action
# uses: chaos-mesh/chaos-mesh-action@master
# env:
# CFG_BASE64: ${{ env.CFG_BASE64 }}
# CHAOS_MESH_VERSION: v1.0.0

# Check whether the chaos test case has completed, polling once per minute for up to 20 times.
- name: Wait for chaos test case complete
run: |
$GITHUB_WORKSPACE/chaos/scripts/check-case.sh
- name: Copy logs to hack permission
if: ${{ always() }}
run: |
mkdir ./logs
sudo cp -r -L /var/log/containers/. ./logs
sudo find /var/ -type f -regex '.*/(server-master|executor).log$' | sudo xargs -i cp {} ./logs || true
sudo chown -R runner ./logs
# Uploading logs as an artifact does not seem to be stable, so we set `continue-on-error: true` here.
- name: Upload logs
continue-on-error: true
uses: actions/upload-artifact@v2
if: ${{ always() }}
with:
name: chaos-base-logs.${{ matrix.chaos-obj }}
path: |
./logs
!./logs/coredns-*
!./logs/local-path-provisioner-*
12 changes: 8 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
TEST_DIR := /tmp/dataflow_engine_test
PARALLEL=3
GO := GO111MODULE=on go
GOBUILD := CGO_ENABLED=0 $(GO) build -trimpath
GOTEST := CGO_ENABLED=1 go test -p $(PARALLEL) --race
FAIL_ON_STDOUT := awk '{ print } END { if (NR > 0) { exit 1 } }'

Expand All @@ -21,20 +22,23 @@ df-proto:
./generate-proto.sh

df-master:
go build -o bin/master ./cmd/master
$(GOBUILD) -o bin/master ./cmd/master
cp ./bin/master ./ansible/roles/common/files/master.bin

df-executor:
go build -o bin/executor ./cmd/executor
$(GOBUILD) -o bin/executor ./cmd/executor
cp ./bin/executor ./ansible/roles/common/files/executor.bin

df-master-client:
go build -o bin/master-client ./cmd/master-client
$(GOBUILD) -o bin/master-client ./cmd/master-client

df-demo:
go build -o bin/demoserver ./cmd/demoserver
$(GOBUILD) -o bin/demoserver ./cmd/demoserver
cp ./bin/demoserver ./ansible/roles/common/files/demoserver.bin

df-chaos-case:
$(GOBUILD) -o bin/df-chaos-case ./chaos/cases

unit_test: check_failpoint_ctl
mkdir -p "$(TEST_DIR)"
$(FAILPOINT_ENABLE)
Expand Down
30 changes: 30 additions & 0 deletions chaos/cases/cases.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"context"

"github.com/pingcap/tiflow/dm/pkg/log"
"go.uber.org/zap"
)

// cases lists the names of the chaos test cases that runCases iterates over.
var cases = []string{"fake-job-normal", "fake-job-fast-finish"}

// runCases walks the registered cases in declaration order and emits one
// info-level log entry per case name.
//
// It currently performs no other work: ctx is accepted but not consulted,
// and the returned error is always nil.
func runCases(ctx context.Context) error {
	for i := 0; i < len(cases); i++ {
		name := cases[i]
		log.L().Info("run case successfully", zap.String("case", name))
	}
	return nil
}
50 changes: 50 additions & 0 deletions chaos/cases/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"flag"
"time"
)

// config is used to run chaos tests.
//
// Its fields are populated from command-line flags registered by newConfig;
// the struct tags give each field the same name as its flag.
type config struct {
// FlagSet is the embedded flag set used for parsing; the "-" tags exclude it
// from toml/yaml/json (de)serialization.
*flag.FlagSet `toml:"-" yaml:"-" json:"-"`

// MasterAddr is the address of server-master.
MasterAddr string `toml:"master-addr" yaml:"master-addr" json:"master-addr"`
// Duration is the duration of cases running.
Duration time.Duration `toml:"duration" yaml:"duration" json:"duration"`

// MasterCount is the expected count of server-master instances.
MasterCount int `toml:"master-count" yaml:"master-count" json:"master-count"`
// WorkerCount is the expected count of executor instances.
WorkerCount int `toml:"worker-count" yaml:"worker-count" json:"worker-count"`
}

// newConfig builds the configuration for the chaos testing suite.
//
// It allocates a flag set named "chaos-case" that reports parse failures as
// returned errors (flag.ContinueOnError) and registers one flag per config
// field. Registration stores each default into its field, so the returned
// config already carries the defaults before parse is called.
func newConfig() *config {
	c := &config{
		FlagSet: flag.NewFlagSet("chaos-case", flag.ContinueOnError),
	}

	// The embedded *flag.FlagSet promotes the *Var registration helpers.
	c.StringVar(&c.MasterAddr, "master-addr", "server-master:10240", "address of server-master")
	c.DurationVar(&c.Duration, "duration", 20*time.Minute, "duration of cases running")
	c.IntVar(&c.MasterCount, "master-count", 3, "expect count of server-master")
	c.IntVar(&c.WorkerCount, "worker-count", 4, "expect count of executor")

	return c
}

// parse feeds args to the embedded flag set, filling in every field whose
// flag was registered on it, and returns the flag set's error unchanged.
func (c *config) parse(args []string) error {
	fs := c.FlagSet
	return fs.Parse(args)
}
Loading

0 comments on commit fb4028a

Please sign in to comment.