Skip to content

Commit

Permalink
Add debug info collection to functional tests ghwf (#1584)
Browse files Browse the repository at this point in the history
* Add debug info collection to the functional tests GitHub workflow, so that when functional tests fail, the state of the cluster is recorded and uploaded as a zip file artifact
  • Loading branch information
jvoravong authored Dec 18, 2024
1 parent 7419b32 commit b49ef63
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 26 deletions.
32 changes: 26 additions & 6 deletions .github/workflows/functional_test_v2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@ on:
branches: [ main ]
workflow_dispatch:
inputs:
UPDATE_EXPECTED_RESULTS:
description: 'Set this to true to update expected results and collect updated test output as a Github workflow artifact.'
UPLOAD_UPDATED_EXPECTED_RESULTS:
description: 'Set this to true to upload updated golden file expected results and upload these results as a Github workflow artifact.'
required: false
default: false
UPLOAD_KUBERNETES_DEBUG_INFO:
description: 'Set this to true to collect the debug info of the k8s cluster and upload this info as a Github workflow artifact.'
required: false
default: false

Expand All @@ -20,7 +24,8 @@ jobs:
env:
KUBECONFIG: /tmp/kube-config-splunk-otel-collector-chart-functional-testing
KUBE_TEST_ENV: kind
UPDATE_EXPECTED_RESULTS: ${{ github.event.inputs.UPDATE_EXPECTED_RESULTS || 'false' }}
UPLOAD_UPDATED_EXPECTED_RESULTS: ${{ github.event.inputs.UPLOAD_UPDATED_EXPECTED_RESULTS || 'false' }}
UPLOAD_KUBERNETES_DEBUG_INFO: ${{ github.event.inputs.UPLOAD_KUBERNETES_DEBUG_INFO || 'false' }}
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -71,13 +76,28 @@ jobs:
run: |
make cert-manager
- name: run functional tests
id: run-functional-tests
env:
K8S_VERSION: ${{ matrix.k8s-version }}
run: |
cd functional_tests
TEARDOWN_BEFORE_SETUP=true UPDATE_EXPECTED_RESULTS=${{ env.UPDATE_EXPECTED_RESULTS }} go test -v -tags ${{ matrix.test-job }}
- name: 'Upload test results'
if: always() && env.UPDATE_EXPECTED_RESULTS == 'true'
TEARDOWN_BEFORE_SETUP=true UPDATE_EXPECTED_RESULTS=${{ env.UPLOAD_UPDATED_EXPECTED_RESULTS }} go test -v -tags ${{ matrix.test-job }}
- name: Collect Kubernetes Cluster debug info on failure
if: always() && (steps.run-functional-tests.outcome == 'failure' || env.UPLOAD_KUBERNETES_DEBUG_INFO == 'true')
id: collect-debug-info
run: |
echo "Functional tests failed. Collecting debug info for current state of the Kubernetes cluster..."
cd tools
./splunk_kubernetes_debug_info.sh
- name: Upload Kubernetes Cluster debug info
if: always() && (steps.run-functional-tests.outcome == 'failure' || env.UPLOAD_KUBERNETES_DEBUG_INFO == 'true')
uses: actions/upload-artifact@v4
with:
name: k8s-debug-info-${{ matrix.test-job }}-${{ matrix.k8s-version }}
path: tools/splunk_kubernetes_debug_info_*
retention-days: 5
- name: Upload test results
if: always() && env.UPLOAD_UPDATED_EXPECTED_RESULTS == 'true'
uses: actions/upload-artifact@v4
with:
name: functional_tests-${{ matrix.test-job }}-${{ matrix.k8s-version }}
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
.idea
*.iml
.DS_Store
*splunk_kubernetes_debug_info_*

# Helm
**/charts/*.tgz
Expand Down
55 changes: 35 additions & 20 deletions tools/splunk_kubernetes_debug_info.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Description:
# This script collects debugging information from a Kubernetes cluster.
# It retrieves networking, firewall, security policies, custom resource definitions (CRDs),
# and logs from specified pods and secrets (sanitized). The outputs are saved to files for each namespace and object type.
# and logs from specified pods. The outputs are saved to files for each namespace and object type.
# This helps in diagnosing and troubleshooting cluster configurations.
# Finally, it compresses all the collected files into a ZIP archive.
#
Expand All @@ -28,7 +28,7 @@
#
# Objects Scraped:
# - Pod logs for agent, cluster-receiver, certmanager, operator, gateway, splunk pods
# - Deployments, daemonsets, secrets, Helm releases matching K8S_OBJECT_NAME_FILTER
# - Deployments, daemonsets, Helm releases matching K8S_OBJECT_NAME_FILTER
# - NetworkPolicies, Services, Ingress resources, Endpoints, Roles, RoleBindings, Security contexts
# - OpenTelemetry Instrumentation objects
# - Custom Resource Definitions (CRDs), Pod Security Policies (PSPs), Security Context Constraints (SCCs)
Expand Down Expand Up @@ -87,10 +87,10 @@ write_output() {
collect_data_namespace() {
local ns=$1

object_types=("deployments" "daemonsets" "configmaps" "secrets" "networkpolicies" "svc" "ingress" "endpoints" "roles" "rolebindings" "otelinst")
object_types=("configmaps" "daemonsets" "deployments" "endpoints" "events" "ingress" "jobs" "networkpolicies" "otelinst" "rolebindings" "roles" "svc")
for type in "${object_types[@]}"; do
stdbuf -oL echo "Collecting $type data for $ns namespace with $k8s_object_name_filter name filter"
if [[ "$type" == "deployment" || "$type" == "daemonset" || "$type" == "configmaps" || "$type" == "secrets" ]]; then
if [[ "$type" == "deployment" || "$type" == "daemonset" || "$type" == "configmaps" ]]; then
kubectl get "$type" -n "$ns" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep -E "$k8s_object_name_filter" | while read object; do
cmd="kubectl get $type $object -n $ns -o yaml"
output=$(eval "$cmd")
Expand Down Expand Up @@ -201,21 +201,6 @@ collect_data_cluster() {
output=$(eval "$cmd")
write_output "$output" "$temp_dir/cluster_custom_resource_definitions.yaml" "$cmd"

echo "Collecting pod security policies..."
cmd="kubectl get psp -o yaml"
output=$(eval "$cmd")
write_output "$output" "$temp_dir/cluster_pod_security_policies.yaml" "$cmd"

echo "Collecting security context constraints..."
cmd="kubectl get scc -o yaml"
output=$(eval "$cmd")
write_output "$output" "$temp_dir/cluster_security_context_constraints.yaml" "$cmd"

echo "Collecting MutatingWebhookConfiguration objects..."
cmd="kubectl get mutatingwebhookconfiguration.admissionregistration.k8s.io -o yaml; kubectl describe mutatingwebhookconfiguration.admissionregistration.k8s.io; kubectl get --raw /metrics | grep apiserver_admission_webhook_rejection_count;"
output=$(eval "$cmd")
write_output "$output" "$temp_dir/cluster_webhooks.yaml" "$cmd"

echo "Checking for cert-manager installation..."
cert_manager_pods=$(kubectl get pods --all-namespaces -l app=cert-manager --no-headers)
if [ -n "$cert_manager_pods" ]; then
Expand All @@ -233,6 +218,33 @@ collect_data_cluster() {
done
}

collect_cluster_resources() {
  #######################################
  # Collect YAML manifests for cluster-scoped resource types that are useful
  # when debugging the collector chart (CRDs, pod security policies, security
  # context constraints, and admission webhook configurations).
  # Globals:   temp_dir (read) - directory the per-object YAML files go into
  # Outputs:   one file per object written via write_output
  # NOTE(review): some types (psp, scc) exist only on certain cluster
  # flavors/versions; for those, kubectl prints an error to stderr and the
  # loop simply collects nothing — TODO confirm that is the intended behavior.
  #######################################
  # List of cluster-scoped resource types to collect
  local -a cluster_object_types=(
    "crds"
    "psp"
    "scc"
    "mutatingwebhookconfiguration.admissionregistration.k8s.io"
    "validatingwebhookconfiguration.admissionregistration.k8s.io"
  )
  local type object api_version cmd output

  for type in "${cluster_object_types[@]}"; do
    echo "Collecting $type cluster-scoped resources..."

    # Fetch each object's name; -r keeps backslashes in names literal (SC2162).
    kubectl get "$type" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | while read -r object; do
      # Get the API version for this object, fallback to "unknown"
      api_version=$(kubectl get "$type" "$object" -o jsonpath='{.apiVersion}' 2>/dev/null || echo "unknown")
      api_version=${api_version//\//_} # Sanitize slashes in API version

      # Run kubectl directly instead of eval'ing a string; keep $cmd only as
      # the human-readable command recorded by write_output.
      cmd="kubectl get $type $object -o yaml"
      output=$(kubectl get "$type" "$object" -o yaml)
      write_output "$output" "$temp_dir/cluster_${type//./_}_${api_version}_${object}.yaml" "$cmd"
    done
  done
}

# Parse input parameters
namespaces=""
k8s_object_name_filter="splunk|collector|otel|certmanager|test|sck|sock"
Expand Down Expand Up @@ -279,9 +291,12 @@ script_start_time=$(date +"%Y-%m-%d %H:%M:%S")
echo "Script start time: $script_start_time"
echo "Script start time: $script_start_time" >> "$output_file"

# Collect cluster-wide data
# Collect cluster instance specific data
collect_data_cluster

# Collect cluster scoped resources data
collect_cluster_resources

# Function to manage parallel processing of namespaces
collect_data_namespace_namespaces() {
local parallelism=20
Expand Down

0 comments on commit b49ef63

Please sign in to comment.