From b4503f4b8f910d46e4c69fff8ea13e91812d8a75 Mon Sep 17 00:00:00 2001
From: Cian Hatton <cian@interchain.io>
Date: Mon, 6 Mar 2023 16:58:49 +0000
Subject: [PATCH] Uploading docker logs on test failures (#3236)

---
 .../e2e-compatibility-workflow-call.yaml      |   9 +
 .github/workflows/e2e-test-workflow-call.yml  |  14 ++
 .github/workflows/e2e.yaml                    |   1 +
 .gitignore                                    |   3 +-
 e2e/README.md                                 |   6 +
 e2e/dockerutil/dockerutil.go                  |  72 ++++++++
 e2e/testsuite/diagnostics/diagnostics.go      | 157 ++++++++++++++++++
 e2e/testsuite/testsuite.go                    |  16 +-
 8 files changed, 274 insertions(+), 4 deletions(-)
 create mode 100644 e2e/dockerutil/dockerutil.go
 create mode 100644 e2e/testsuite/diagnostics/diagnostics.go

diff --git a/.github/workflows/e2e-compatibility-workflow-call.yaml b/.github/workflows/e2e-compatibility-workflow-call.yaml
index 58c5642a4f4..fddf2f6ffdc 100644
--- a/.github/workflows/e2e-compatibility-workflow-call.yaml
+++ b/.github/workflows/e2e-compatibility-workflow-call.yaml
@@ -54,3 +54,12 @@ jobs:
           CHAIN_B_TAG: "${{ matrix.chain-b }}"
           CHAIN_BINARY: "${{ matrix.chain-binary }}"
           RELAYER_TYPE: "${{ matrix.relayer-type }}"
+      - name: Upload Diagnostics
+        uses: actions/upload-artifact@v3
+        # we only want to upload logs on test failures.
+        if: ${{ failure() }}
+        continue-on-error: true
+        with:
+          name: "${{ matrix.entrypoint }}-${{ matrix.test }}"
+          path: e2e/diagnostics
+          retention-days: 5
diff --git a/.github/workflows/e2e-test-workflow-call.yml b/.github/workflows/e2e-test-workflow-call.yml
index 3ed7f572c11..8f45c8c1aa2 100644
--- a/.github/workflows/e2e-test-workflow-call.yml
+++ b/.github/workflows/e2e-test-workflow-call.yml
@@ -61,6 +61,11 @@ on:
         required: false
         type: boolean
         default: false
+      upload-logs:
+        description: "Specify flag to indicate that logs should be uploaded on failure"
+        required: false
+        type: boolean
+        default: false
 
 env:
   REGISTRY: ghcr.io
@@ -160,6 +165,15 @@ jobs:
         with:
           go-version: 1.19
       - name: Run e2e Test
+        id: e2e_test
         run: |
           cd e2e
           make e2e-test entrypoint=${{ matrix.entrypoint }} test=${{ matrix.test }}
+      - name: Upload Diagnostics
+        uses: actions/upload-artifact@v3
+        if: ${{ failure() && inputs.upload-logs }}
+        continue-on-error: true
+        with:
+          name: "${{ matrix.entrypoint }}-${{ matrix.test }}"
+          path: e2e/diagnostics
+          retention-days: 5
diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index 9fe4b02cefd..585d4eca11d 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -62,6 +62,7 @@ jobs:
       secrets: inherit
       with:
         build-and-push-docker-image: true
+        upload-logs: true
         chain-image: ghcr.io/cosmos/ibc-go-simd
         chain-a-tag: "${{ needs.determine-image-tag.outputs.simd-tag }}"
         chain-b-tag: "${{ needs.determine-image-tag.outputs.simd-tag }}"
diff --git a/.gitignore b/.gitignore
index 1844a2347f6..b9c46cafa65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ mytestnet
 coverage.txt
 profile.out
 sim_log_file
+e2e/diagnostics
 
 # Vagrant
 .vagrant/
@@ -57,4 +58,4 @@ dependency-graph.png
 
 # Go
 go.work
-go.work.sum
\ No newline at end of file
+go.work.sum
diff --git a/e2e/README.md b/e2e/README.md
index 56463c219be..d6a03e4485c 100644
--- a/e2e/README.md
+++ b/e2e/README.md
@@ -354,6 +354,12 @@ json matrix files under .github/compatibility-test-matrices and is equivalent to
 
   This issue doesn't seem to occur on other operating systems.
 
+### Accessing Logs
+
+- When a test fails in GitHub. The logs of the test will be uploaded (viewable in the summary page of the workflow). Note: There 
+  may be some discrepancy in the logs collected and the output of interchain test. The containers may run for a some
+  time after the logs are collected, resulting in the displayed logs to differ slightly.
+
 ## Importable Workflow
 
 This repository contains an [importable workflow](https://github.com/cosmos/ibc-go/blob/bc963bcfd115a0e06b8196b114496db5ea011247/.github/workflows/e2e-compatibility-workflow-call.yaml) that can be used from any other repository to test chain upgrades. The workflow
diff --git a/e2e/dockerutil/dockerutil.go b/e2e/dockerutil/dockerutil.go
new file mode 100644
index 00000000000..aa7497645bf
--- /dev/null
+++ b/e2e/dockerutil/dockerutil.go
@@ -0,0 +1,72 @@
+package dockerutil
+
+import (
+	"archive/tar"
+	"context"
+	"fmt"
+	"io"
+	"path"
+	"testing"
+
+	dockertypes "github.com/docker/docker/api/types"
+	"github.com/docker/docker/api/types/filters"
+	dockerclient "github.com/docker/docker/client"
+)
+
+const testLabel = "ibc-test"
+
+// GetTestContainers returns all docker containers that have been created by interchain test.
+func GetTestContainers(t *testing.T, ctx context.Context, dc *dockerclient.Client) ([]dockertypes.Container, error) {
+	t.Helper()
+
+	testContainers, err := dc.ContainerList(ctx, dockertypes.ContainerListOptions{
+		All: true,
+		Filters: filters.NewArgs(
+			// see: https://github.com/strangelove-ventures/interchaintest/blob/0bdc194c2aa11aa32479f32b19e1c50304301981/internal/dockerutil/setup.go#L31-L36
+			// for the label needed to identify test containers.
+			filters.Arg("label", testLabel+"="+t.Name()),
+		),
+	})
+
+	if err != nil {
+		return nil, fmt.Errorf("failed listing containers: %s", err)
+	}
+
+	return testContainers, nil
+}
+
+// GetContainerLogs returns the logs of a container as a byte array.
+func GetContainerLogs(ctx context.Context, dc *dockerclient.Client, containerName string) ([]byte, error) {
+	readCloser, err := dc.ContainerLogs(ctx, containerName, dockertypes.ContainerLogsOptions{
+		ShowStdout: true,
+		ShowStderr: true,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed reading logs in test cleanup: %s", err)
+	}
+	return io.ReadAll(readCloser)
+}
+
+// GetFileContentsFromContainer reads the contents of a specific file from a container.
+func GetFileContentsFromContainer(ctx context.Context, dc *dockerclient.Client, containerID, absolutePath string) ([]byte, error) {
+	readCloser, _, err := dc.CopyFromContainer(ctx, containerID, absolutePath)
+	if err != nil {
+		return nil, fmt.Errorf("copying from container: %w", err)
+	}
+
+	defer readCloser.Close()
+
+	fileName := path.Base(absolutePath)
+	tr := tar.NewReader(readCloser)
+
+	hdr, err := tr.Next()
+	if err != nil {
+		return nil, err
+	}
+
+	if hdr.Name != fileName {
+		return nil, fmt.Errorf("expected to find %s but found %s", fileName, hdr.Name)
+	}
+
+	return io.ReadAll(tr)
+}
diff --git a/e2e/testsuite/diagnostics/diagnostics.go b/e2e/testsuite/diagnostics/diagnostics.go
new file mode 100644
index 00000000000..6226dd4a339
--- /dev/null
+++ b/e2e/testsuite/diagnostics/diagnostics.go
@@ -0,0 +1,157 @@
+package diagnostics
+
+import (
+	"context"
+	"fmt"
+	"os"
+	ospath "path"
+	"strings"
+	"testing"
+
+	dockertypes "github.com/docker/docker/api/types"
+	dockerclient "github.com/docker/docker/client"
+	"github.com/strangelove-ventures/interchaintest/v7/ibc"
+
+	"github.com/cosmos/ibc-go/e2e/dockerutil"
+	"github.com/cosmos/ibc-go/e2e/testconfig"
+)
+
+const (
+	e2eDir = "e2e"
+)
+
+// Collect can be used in `t.Cleanup` and will copy all the of the container logs and relevant files
+// into e2e/<test-suite>/<test-name>.log. These log files will be uploaded to GH upon test failure.
+func Collect(t *testing.T, dc *dockerclient.Client, cfg testconfig.ChainOptions) {
+	t.Helper()
+
+	if !t.Failed() {
+		t.Logf("test passed, not uploading logs")
+		return
+	}
+
+	t.Logf("writing logs for test: %s", t.Name())
+
+	ctx := context.TODO()
+	e2eDir, err := getE2EDir(t)
+	if err != nil {
+		t.Logf("failed finding log directory: %s", err)
+		return
+	}
+
+	logsDir := fmt.Sprintf("%s/diagnostics", e2eDir)
+
+	if err := os.MkdirAll(fmt.Sprintf("%s/%s", logsDir, t.Name()), 0750); err != nil {
+		t.Logf("failed creating logs directory in test cleanup: %s", err)
+		return
+	}
+
+	testContainers, err := dockerutil.GetTestContainers(t, ctx, dc)
+	if err != nil {
+		t.Logf("failed listing containers test cleanup: %s", err)
+		return
+	}
+
+	for _, container := range testContainers {
+		containerName := getContainerName(t, container)
+		containerDir := fmt.Sprintf("%s/%s/%s", logsDir, t.Name(), containerName)
+		if err := os.MkdirAll(containerDir, 0750); err != nil {
+			t.Logf("failed creating logs directory for container %s: %s", containerDir, err)
+			continue
+		}
+
+		logsBz, err := dockerutil.GetContainerLogs(ctx, dc, container.ID)
+		if err != nil {
+			t.Logf("failed reading logs in test cleanup: %s", err)
+			continue
+		}
+
+		logFile := fmt.Sprintf("%s/%s.log", containerDir, containerName)
+		if err := os.WriteFile(logFile, logsBz, 0750); err != nil {
+			t.Logf("failed writing log file for container %s in test cleanup: %s", containerName, err)
+			continue
+		}
+
+		t.Logf("successfully wrote log file %s", logFile)
+		diagnosticFiles := chainDiagnosticAbsoluteFilePaths(*cfg.ChainAConfig)
+		diagnosticFiles = append(diagnosticFiles, chainDiagnosticAbsoluteFilePaths(*cfg.ChainBConfig)...)
+
+		for _, absoluteFilePathInContainer := range diagnosticFiles {
+			localFilePath := ospath.Join(containerDir, ospath.Base(absoluteFilePathInContainer))
+			if err := fetchAndWriteDiagnosticsFile(ctx, dc, container.ID, localFilePath, absoluteFilePathInContainer); err != nil {
+				t.Logf("failed to fetch and write file %s for container %s in test cleanup: %s", absoluteFilePathInContainer, containerName, err)
+				continue
+			}
+			t.Logf("successfully wrote diagnostics file %s", absoluteFilePathInContainer)
+		}
+	}
+}
+
+// getContainerName returns a either the ID of the container or stripped down human-readable
+// version of the name if the name is non-empty.
+//
+// Note: You should still always use the ID  when interacting with the docker client.
+func getContainerName(t *testing.T, container dockertypes.Container) string {
+	t.Helper()
+	// container will always have an id, by may not have a name.
+	containerName := container.ID
+	if len(container.Names) > 0 {
+		containerName = container.Names[0]
+		// remove the test name from the container as the folder structure will provide this
+		// information already.
+		containerName = strings.TrimRight(containerName, "-"+t.Name())
+		containerName = strings.TrimLeft(containerName, "/")
+	}
+	return containerName
+}
+
+// fetchAndWriteDiagnosticsFile fetches the contents of a single file from the given container id and writes
+// the contents of the file to a local path provided.
+func fetchAndWriteDiagnosticsFile(ctx context.Context, dc *dockerclient.Client, containerID, localPath, absoluteFilePathInContainer string) error {
+	fileBz, err := dockerutil.GetFileContentsFromContainer(ctx, dc, containerID, absoluteFilePathInContainer)
+	if err != nil {
+		return err
+	}
+
+	filePath := ospath.Join(localPath)
+	if err := os.WriteFile(filePath, fileBz, 0750); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// chainDiagnosticAbsoluteFilePaths returns a slice of absolute file paths (in the containers) which are the files that should be
+// copied locally when fetching diagnostics.
+func chainDiagnosticAbsoluteFilePaths(cfg ibc.ChainConfig) []string {
+	return []string{
+		fmt.Sprintf("/var/cosmos-chain/%s/config/genesis.json", cfg.Name),
+		fmt.Sprintf("/var/cosmos-chain/%s/config/app.toml", cfg.Name),
+		fmt.Sprintf("/var/cosmos-chain/%s/config/config.toml", cfg.Name),
+		fmt.Sprintf("/var/cosmos-chain/%s/config/client.toml", cfg.Name),
+	}
+}
+
+// getE2EDir finds the e2e directory above the test.
+func getE2EDir(t *testing.T) (string, error) {
+	t.Helper()
+
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+
+	const maxAttempts = 100
+	count := 0
+	for ; !strings.HasSuffix(wd, e2eDir) || count > maxAttempts; wd = ospath.Dir(wd) {
+		count++
+	}
+
+	// arbitrary value to avoid getting stuck in an infinite loop if this is called
+	// in a context where the e2e directory does not exist.
+	if count > maxAttempts {
+		return "", fmt.Errorf("unable to find e2e directory after %d tries", maxAttempts)
+	}
+
+	return wd, nil
+}
diff --git a/e2e/testsuite/testsuite.go b/e2e/testsuite/testsuite.go
index edd4642dcb5..965e4177c86 100644
--- a/e2e/testsuite/testsuite.go
+++ b/e2e/testsuite/testsuite.go
@@ -35,6 +35,7 @@ import (
 	"github.com/cosmos/ibc-go/e2e/relayer"
 	"github.com/cosmos/ibc-go/e2e/semverutil"
 	"github.com/cosmos/ibc-go/e2e/testconfig"
+	"github.com/cosmos/ibc-go/e2e/testsuite/diagnostics"
 	"github.com/cosmos/ibc-go/e2e/testvalues"
 	controllertypes "github.com/cosmos/ibc-go/v7/modules/apps/27-interchain-accounts/controller/types"
 	feetypes "github.com/cosmos/ibc-go/v7/modules/apps/29-fee/types"
@@ -442,17 +443,26 @@ func (s *E2ETestSuite) AssertPacketRelayed(ctx context.Context, chain *cosmos.Co
 // test and can be retrieved with GetChains.
 func (s *E2ETestSuite) createCosmosChains(chainOptions testconfig.ChainOptions) (*cosmos.CosmosChain, *cosmos.CosmosChain) {
 	client, network := interchaintest.DockerSetup(s.T())
+	t := s.T()
 
 	s.logger = zap.NewExample()
 	s.DockerClient = client
 	s.network = network
 
-	logger := zaptest.NewLogger(s.T())
+	logger := zaptest.NewLogger(t)
 
 	numValidators, numFullNodes := getValidatorsAndFullNodes()
 
-	chainA := cosmos.NewCosmosChain(s.T().Name(), *chainOptions.ChainAConfig, numValidators, numFullNodes, logger)
-	chainB := cosmos.NewCosmosChain(s.T().Name(), *chainOptions.ChainBConfig, numValidators, numFullNodes, logger)
+	chainA := cosmos.NewCosmosChain(t.Name(), *chainOptions.ChainAConfig, numValidators, numFullNodes, logger)
+	chainB := cosmos.NewCosmosChain(t.Name(), *chainOptions.ChainBConfig, numValidators, numFullNodes, logger)
+
+	// this is intentionally called after the interchaintest.DockerSetup function. The above function registers a
+	// cleanup task which deletes all containers. By registering a cleanup function afterwards, it is executed first
+	// this allows us to process the logs before the containers are removed.
+	t.Cleanup(func() {
+		diagnostics.Collect(t, s.DockerClient, chainOptions)
+	})
+
 	return chainA, chainB
 }