Skip to content

Commit

Permalink
Stability: Deflake Kube2e Tests (#8429)
Browse files Browse the repository at this point in the history
* cleanup default proxy in DeleteSnapshot

* cleanup dynamic proxies in deleteSnapshot

* update readme

* add changelog

* goimports -w .

* use proxy.list to identify dynamically generated proxies

* add flake attempts decorator to test that failed

* support proxy writing,but warn

* add ref to example in readme

* allow customizing writeNamespace

* respond to pr feedback
  • Loading branch information
sam-heilbron authored Jun 30, 2023
1 parent 9202582 commit a917c82
Show file tree
Hide file tree
Showing 9 changed files with 83 additions and 28 deletions.
7 changes: 7 additions & 0 deletions changelog/v1.15.0-beta16/kube2e-gw-deflake.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
changelog:
- type: NON_USER_FACING
issueLink: https://github.com/solo-io/gloo/issues/7795
resolvesIssue: false
description: >-
Delete Proxies when cleaning up the Snapshot between tests. This ensures that each tests starts
with a clean state.
52 changes: 38 additions & 14 deletions test/helpers/snapshot_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,34 @@ package helpers
import (
"time"

"github.com/onsi/ginkgo/v2"

"github.com/solo-io/gloo/projects/gloo/pkg/defaults"

"github.com/avast/retry-go"
"github.com/solo-io/gloo/projects/gloo/pkg/api/v1/gloosnapshot"
"github.com/solo-io/solo-kit/pkg/api/v1/clients"
"github.com/solo-io/solo-kit/pkg/errors"
)

var _ SnapshotWriter = new(snapshotWriterImpl)
var _ SnapshotWriter = new(SnapshotWriterImpl)

type SnapshotWriter interface {
WriteSnapshot(snapshot *gloosnapshot.ApiSnapshot, writeOptions clients.WriteOpts) error
DeleteSnapshot(snapshot *gloosnapshot.ApiSnapshot, deleteOptions clients.DeleteOpts) error
}

type snapshotWriterImpl struct {
type SnapshotWriterImpl struct {
ResourceClientSet
retryOptions []retry.Option

// writeNamespace is the namespace that the SnapshotWriter expects resources to be written to by Gloo
// This is controlled by the settings.WriteNamespace option
// This field is used by DeleteSnapshot to delete all Proxy resources in the namespace
writeNamespace string
}

func NewSnapshotWriter(clientSet ResourceClientSet, retryOptions []retry.Option) *snapshotWriterImpl {
func NewSnapshotWriter(clientSet ResourceClientSet, retryOptions []retry.Option) *SnapshotWriterImpl {
defaultRetryOptions := []retry.Option{
retry.Attempts(3),
retry.RetryIf(func(err error) bool {
Expand All @@ -32,14 +41,24 @@ func NewSnapshotWriter(clientSet ResourceClientSet, retryOptions []retry.Option)
retry.DelayType(retry.BackOffDelay),
}

return &snapshotWriterImpl{
return &SnapshotWriterImpl{
ResourceClientSet: clientSet,
retryOptions: append(defaultRetryOptions, retryOptions...),
// By default, Gloo will write resources to the gloo-system namespace
// This can be overridden by setting the WithNamespace option on the SnapshotWriter
writeNamespace: defaults.GlooSystem,
}
}

// WithWriteNamespace sets the namespace that the SnapshotWriter expects resources to be written to
// This is used when Proxies are deleted, by listing all Proxies in this namespace and removing them
func (s *SnapshotWriterImpl) WithWriteNamespace(writeNamespace string) *SnapshotWriterImpl {
s.writeNamespace = writeNamespace
return s
}

// WriteSnapshot writes all resources in the ApiSnapshot to the cache
func (s snapshotWriterImpl) WriteSnapshot(snapshot *gloosnapshot.ApiSnapshot, writeOptions clients.WriteOpts) error {
func (s *SnapshotWriterImpl) WriteSnapshot(snapshot *gloosnapshot.ApiSnapshot, writeOptions clients.WriteOpts) error {
return retry.Do(func() error {
if writeOptions.Ctx.Err() != nil {
// intentionally return early if context is already done
Expand All @@ -51,7 +70,7 @@ func (s snapshotWriterImpl) WriteSnapshot(snapshot *gloosnapshot.ApiSnapshot, wr
}

// WriteSnapshot writes all resources in the ApiSnapshot to the cache
func (s snapshotWriterImpl) doWriteSnapshot(snapshot *gloosnapshot.ApiSnapshot, writeOptions clients.WriteOpts) error {
func (s *SnapshotWriterImpl) doWriteSnapshot(snapshot *gloosnapshot.ApiSnapshot, writeOptions clients.WriteOpts) error {
// We intentionally create child resources first to avoid having the validation webhook reject
// the parent resource

Expand Down Expand Up @@ -120,16 +139,14 @@ func (s snapshotWriterImpl) doWriteSnapshot(snapshot *gloosnapshot.ApiSnapshot,
return writeErr
}
}
for _, proxy := range snapshot.Proxies {
if _, writeErr := s.ProxyClient().Write(proxy, writeOptions); !s.isContinuableWriteError(writeErr) {
return writeErr
}
if len(snapshot.Proxies) > 0 {
// It is recommended to configure Gateway resources (GW, VS, RT, etc) instead of Proxy resources
ginkgo.Fail("Proxies are intended to be an opaque resources to users and are not recommended to be written directly in tests")
}

return nil
}

func (s snapshotWriterImpl) isContinuableWriteError(writeError error) bool {
func (s *SnapshotWriterImpl) isContinuableWriteError(writeError error) bool {
if writeError == nil {
return true
}
Expand All @@ -140,7 +157,7 @@ func (s snapshotWriterImpl) isContinuableWriteError(writeError error) bool {
}

// DeleteSnapshot deletes all resources in the ApiSnapshot from the cache
func (s snapshotWriterImpl) DeleteSnapshot(snapshot *gloosnapshot.ApiSnapshot, deleteOptions clients.DeleteOpts) error {
func (s *SnapshotWriterImpl) DeleteSnapshot(snapshot *gloosnapshot.ApiSnapshot, deleteOptions clients.DeleteOpts) error {
// We intentionally delete resources in the reverse order that we create resources
// If we delete child resources first, the validation webhook may reject the change

Expand Down Expand Up @@ -225,7 +242,14 @@ func (s snapshotWriterImpl) DeleteSnapshot(snapshot *gloosnapshot.ApiSnapshot, d

// Proxies are auto generated by Gateway resources
// Therefore we delete Proxies after we have deleted the resources that may regenerate a Proxy
for _, proxy := range snapshot.Proxies {
proxies, err := s.ProxyClient().List(s.writeNamespace, clients.ListOpts{
Ctx: deleteOptions.Ctx,
Cluster: deleteOptions.Cluster,
})
if err != nil {
return err
}
for _, proxy := range proxies {
proxyNamespace, proxyName := proxy.GetMetadata().Ref().Strings()
if deleteErr := s.ProxyClient().Delete(proxyNamespace, proxyName, deleteOptions); deleteErr != nil {
return deleteErr
Expand Down
11 changes: 9 additions & 2 deletions test/kube2e/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,13 @@ The `GetTestHelper` util method handles installing gloo from either a local or r
The list of tests to run during CI and nightly builds is provided in `kube-e2e-test-type` matrices in the github workflows. A new test can be added to one or both lists of tests.
## Local Development

### Setup
### Setup (Previously Released Assets)
It is possible to run these tests against a previously released version of Gloo Edge. This is useful for testing a release candidate, or a nightly build.

There is no setup required for this option, as the test suite will download the helm chart archive and `glooctl` binary from the specified release. You will use the `RELEASED_VERSION` environment variable when running the tests. See the [variable definition](/test/testutils/env.go) for more details.

### Setup (Locally Build Assets)

For these tests to run, we require the following conditions:
- Gloo Edge Helm chart archive be present in the `_test` folder,
- `glooctl` be built in the`_output` folder
Expand All @@ -49,7 +55,7 @@ Example:
CLUSTER_NAME=solo-test-cluster CLUSTER_NODE_VERSION=v1.25.3 VERSION=v1.0.0-solo-test ci/kind/setup-kind.sh
```

### Verify Your Setup
#### Verify Your Setup
Before running your tests, it's worthwhile to verify that a cluster was created, and the proper images have been loaded.

```bash
Expand Down Expand Up @@ -84,6 +90,7 @@ The below table contains the environment variables that can be used to configure
| WAIT_ON_FAIL | 0 | Set to 1 to prevent Ginkgo from cleaning up the Gloo Edge installation in case of failure. Useful to exec into inspect resources created by the test. A command to resume the test run (and thus clean up resources) will be logged to the output. |
| TEAR_DOWN | false | Set to true to uninstall Gloo after the test suite completes |
| RELEASED_VERSION | '' | Used by nightlies to tests a specific released version. 'LATEST' will find the latest release |

#### Common Test Errors
`getting Helm chart version: expected a single entry with name [gloo], found: 5`\
The test helm charts are written to the `_test` directory, with the `index.yaml` file containing references to all available charts. The tests require that this file contain only 1 entry. Delete the other entries manually, or run `make clean` to delete this folder entirely, and then re-build the test helm chart.
2 changes: 1 addition & 1 deletion test/kube2e/gateway/gateway_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func StartTestHelper() {
resourceClientset, err = kube2e.NewDefaultKubeResourceClientSet(ctx)
Expect(err).NotTo(HaveOccurred(), "can create kube resource client set")

snapshotWriter = helpers.NewSnapshotWriter(resourceClientset, []retry.Option{})
snapshotWriter = helpers.NewSnapshotWriter(resourceClientset, []retry.Option{}).WithWriteNamespace(testHelper.InstallNamespace)
}

func TearDownTestHelper() {
Expand Down
4 changes: 3 additions & 1 deletion test/kube2e/gateway/gateway_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2232,7 +2232,9 @@ spec:
kube2e.UpdateAlwaysAcceptSetting(ctx, false, testHelper.InstallNamespace)
})

Context("with a mix of valid and invalid virtual services", func() {
Context("with a mix of valid and invalid virtual services", FlakeAttempts(3), func() {
// We have resolved most of the flakiness in this test, but it still occasionally fails
// We have not been able to reproduce the failure locally, so we are marking it as flaky

var (
validVsName, invalidVsName string
Expand Down
2 changes: 1 addition & 1 deletion test/kube2e/gloo/gloo_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ var _ = BeforeSuite(func() {
resourceClientset, err = kube2e.NewDefaultKubeResourceClientSet(ctx)
Expect(err).NotTo(HaveOccurred(), "can create kube resource client set")

snapshotWriter = helpers.NewSnapshotWriter(resourceClientset, []retry.Option{})
snapshotWriter = helpers.NewSnapshotWriter(resourceClientset, []retry.Option{}).WithWriteNamespace(testHelper.InstallNamespace)

envoyFactory = envoy.NewFactory()

Expand Down
2 changes: 1 addition & 1 deletion test/kube2e/gloomtls/gloo_mtls_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func StartTestHelper() {
resourceClientset, err = kube2e.NewDefaultKubeResourceClientSet(ctx)
Expect(err).NotTo(HaveOccurred(), "can create kube resource client set")

snapshotWriter = helpers.NewSnapshotWriter(resourceClientset, []retry.Option{})
snapshotWriter = helpers.NewSnapshotWriter(resourceClientset, []retry.Option{}).WithWriteNamespace(testHelper.InstallNamespace)
}

func TearDownTestHelper() {
Expand Down
25 changes: 17 additions & 8 deletions test/kube2e/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
"runtime"
"time"

"github.com/solo-io/gloo/test/testutils"

"github.com/onsi/ginkgo/v2"
"github.com/solo-io/go-utils/stats"

Expand Down Expand Up @@ -174,15 +176,22 @@ func GetSimpleTestRunnerHttpResponse() string {
// For nightly runs, we want to install a released version rather than using a locally built chart
// To do this, set the environment variable RELEASED_VERSION with either a version name or "LATEST" to get the last release
func GetTestReleasedVersion(ctx context.Context, repoName string) string {
var useVersion string
if useVersion = os.Getenv("RELEASED_VERSION"); useVersion != "" {
if useVersion == "LATEST" {
_, current, err := upgrade.GetUpgradeVersions(ctx, repoName)
Expect(err).NotTo(HaveOccurred())
useVersion = current.String()
}
releasedVersion := os.Getenv(testutils.ReleasedVersion)

if releasedVersion == "" {
// In the case where the released version is empty, we return an empty string
// The function which consumes this value will then use the locally built chart
return releasedVersion
}
return useVersion

if releasedVersion == "LATEST" {
_, current, err := upgrade.GetUpgradeVersions(ctx, repoName)
Expect(err).NotTo(HaveOccurred())
return current.String()
}

// Assume that releasedVersion is a valid version, for a previously released version of Gloo Edge
return releasedVersion
}
func GetTestHelper(ctx context.Context, namespace string) (*helper.SoloTestHelper, error) {
cwd, err := os.Getwd()
Expand Down
6 changes: 6 additions & 0 deletions test/testutils/env.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ const (

// ServiceLogLevel is used to set the log level for the test services. See services/logging.go for more details
ServiceLogLevel = "SERVICE_LOG_LEVEL"

// ReleasedVersion can be used when running KubeE2E tests to have the test suite use a previously released version of Gloo Edge
// If set to 'LATEST', the most recently released version will be used
// If set to another value, the test suite will use that version (ie '1.15.0-beta1')
// This is an optional value, so if it is not set, the test suite will use the locally built version of Gloo Edge
ReleasedVersion = "RELEASED_VERSION"
)

// ShouldTearDown returns true if any assets that were created before a test (for example Gloo being installed)
Expand Down

0 comments on commit a917c82

Please sign in to comment.