CSMDS-315: Extend report.sh with Kafka custom resource dump (strimzi#22)

CSMDS-321: Dump all Kafka resources in report.sh (strimzi#24) CSMDS-329: Add all topic describe to report.sh (strimzi#37) CSMDS-420: Fix report.sh to not fail when Kafka resource is being deleted during script run (strimzi#39) CSMDS-317: Add java_thread_dump.sh to dump Java threads of all containers o… (strimzi#23)
david-simon · Mar 28, 2024 · 85e24f1 · 85e24f1
1 parent 81ec77a
commit 85e24f1
Show file tree

Hide file tree

Showing 2 changed files with 216 additions and 0 deletions.
diff --git a/tools/java_thread_dump.sh b/tools/java_thread_dump.sh
@@ -0,0 +1,177 @@
+#!/usr/bin/env bash
+# Self contained Strimzi thread dump tool.
+set -Eeuo pipefail  # -E: ERR traps are inherited by functions; -e: exit on error; -u: unset variables are errors; pipefail: a pipeline fails if any stage fails
+if [[ $(uname -s) == "Darwin" ]]; then  # on macOS, route these tools to their g-prefixed commands (presumably the GNU builds)
+  shopt -s expand_aliases  # aliases are not expanded in non-interactive shells unless this option is set
+  alias echo="gecho"; alias grep="ggrep"; alias sed="gsed"; alias date="gdate"
+fi
+
+error() {  # prints all arguments to stderr, then ends the script with status 1
+  echo "$@" 1>&2 && exit 1  # exit is only reached when echo succeeded
+}
+
+{ # this ensures that the entire script is downloaded #
+KUBECTL_INSTALLED=false  # set to true below when kubectl is found
+OC_INSTALLED=false  # set to true below when oc is found and kubectl is not
+KUBE_CLIENT="kubectl"  # CLI used for every cluster call; switched to "oc" as a fallback
+CONTAINER=""  # --container value; empty means every container of the pod
+OUT_DIR=""  # --out-dir value; empty means a mktemp directory is created later
+DUMPS=1  # --dumps default
+INTERVAL=5  # --interval default, in seconds
+readonly JCMD_LIST_CMD="jcmd -l | grep -v JCmd"  # run inside a container; drops the lines that contain "JCmd"
+readonly JCMD_DUMP_CMD_TMPL="jcmd PID Thread.print"  # template; the literal PID is replaced with a process id before use
+
+# bash version check
+if [[ -z ${BASH_VERSINFO+x} ]]; then  # ${var+x} expands to "x" only when var is set, so this is true when BASH_VERSINFO is unset
+  error "No bash version information available, aborting"
+fi
+if [[ "${BASH_VERSINFO[0]}" -lt 4 ]]; then  # element 0 holds the major version
+  error "You need bash version >= 4 to run the script"
+fi
+
+# kube client check
+if [[ -x "$(command -v kubectl)" ]]; then  # kubectl takes precedence when it resolves to an executable
+  KUBECTL_INSTALLED=true
+else
+  if [[ -x "$(command -v oc)" ]]; then  # oc is only probed when kubectl is missing
+    OC_INSTALLED=true
+    KUBE_CLIENT="oc"
+  fi
+fi
+if [[ $OC_INSTALLED = false && $KUBECTL_INSTALLED = false ]]; then
+  error "There is no kubectl or oc installed"
+fi
+
+# kube connectivity check
+$KUBE_CLIENT version -o yaml --request-timeout=5s 1>/dev/null  # stdout is discarded; with "set -e" a failing call ends the script here
+
+readonly USAGE="
+Usage: java_thread_dump.sh [options]
+This tool dumps the threads of all Java processes running in the containers of a specific pod.
+
+Required:
+  --namespace=<string>          Kubernetes namespace.
+  --pod=<string>                Pod name. Must be a cluster operator, entity operator, kafka, zookeeper or cruise control pod.
+
+Optional:
+  --container=<string>          Container name to limit the thread dump to. By default, all containers are captured with thread dump.
+  --out-dir=<string>            Script output directory.
+  --dumps=<int>                 Number of thread dumps to capture. 1 by default.
+  --interval=<int>              Number of seconds to wait between 2 dumps. 5 by default.
+"
+OPTSPEC=":-:"  # leading ":" = silent error mode; "-:" = option "-" with an argument, so "--name=value" arrives as optchar "-" and OPTARG "name=value"
+while getopts "$OPTSPEC" optchar; do
+  case "${optchar}" in
+    -)
+      case "${OPTARG}" in
+        namespace=*)
+          NAMESPACE=${OPTARG#*=} && readonly NAMESPACE  # keep the text after the first "=" and freeze it
+          ;;
+        pod=*)
+          POD=${OPTARG#*=} && readonly POD
+          ;;
+        container=*)
+          CONTAINER=${OPTARG#*=} && readonly CONTAINER
+          ;;
+        out-dir=*)
+          OUT_DIR=${OPTARG#*=}
+          OUT_DIR=${OUT_DIR//\~/$HOME} && readonly OUT_DIR  # every literal "~" is replaced with $HOME
+          ;;
+        dumps=*)
+          DUMPS=${OPTARG#*=} && readonly DUMPS
+          ;;
+        interval=*)
+          INTERVAL=${OPTARG#*=} && readonly INTERVAL
+          ;;
+        *)
+          error "$USAGE"  # unknown long option
+          ;;
+      esac;;
+  esac  # NOTE(review): optchar values other than "-" (e.g. "?" for an unknown short option) match no arm and are ignored
+done
+shift $((OPTIND-1))  # drop the arguments that getopts consumed
+
+if [[ -z ${NAMESPACE:-} || -z ${POD:-} ]]; then  # ":-" keeps "set -u" from aborting on an omitted option, so the usage text is printed instead
+  error "$USAGE"
+fi
+
+if [[ -z $OUT_DIR ]]; then
+  OUT_DIR="$(mktemp -d)"  # no --out-dir given: work in a fresh temporary directory
+fi
+
+if [[ -z $($KUBE_CLIENT get ns "$NAMESPACE" -o name --ignore-not-found) ]]; then  # empty output is treated as "namespace does not exist"
+  error "Namespace $NAMESPACE not found! Exiting"
+fi
+
+mkdir -p "$OUT_DIR/dumps"  # all dump files are written below this directory
+
+declare -a containers  # names of the containers to dump
+if [[ -z $CONTAINER ]]; then
+  container_list=$($KUBE_CLIENT get pod -n "$NAMESPACE" "$POD" -ojsonpath="{.spec.containers[*].name}")  # container names taken from the pod spec
+  for c in $container_list;  # unquoted, so word splitting yields one name per iteration
+  do
+    containers+=("$c")
+  done
+else
+  containers+=("$CONTAINER")  # --container restricts the run to a single container
+fi
+
+dump_count=0  # number of dump files written across all rounds
+for (( i=0 ; i<DUMPS ; i++ ));
+do
+    if [[ $i -ne 0 ]]; then  # wait between rounds, but not before the first one
+      echo "Backing off for ${INTERVAL}s"
+      sleep "$INTERVAL"
+    fi
+
+    for c in "${containers[@]}";
+    do
+      java_processes_list=$($KUBE_CLIENT exec -n "$NAMESPACE" "$POD" -c "$c" -- /bin/bash -c "$JCMD_LIST_CMD" 2>/dev/null) || true  # a failing listing just leaves the list empty
+      if [[ -z "$java_processes_list" ]]; then
+        echo "Skipping container $c as it does not have a running Java process"
+        continue
+      fi
+
+      declare -a jprocesses
+      jprocesses=()  # reset for every container
+      while read -r line
+      do
+          jprocesses+=("$line")
+      done <<< "$java_processes_list"
+
+      mkdir -p "$OUT_DIR/dumps/$c"
+
+      for line in "${jprocesses[@]}"; do
+          pid=$(echo "$line" | cut -f1 -d' ')  # first space-separated field of the listing line
+          main_class=$(echo "$line" | cut -f2 -d' ')  # second field; presumably a jar path rather than a class name for JVMs started with -jar
+
+          echo "Dumping threads from container ${c} PID ${pid} main class ${main_class} #${i}"
+
+          dump_file_name="thread_dump-${c}-${pid}-${main_class//\//_}"  # "/" becomes "_" so a path-like second field cannot point the redirect into a missing sub-directory
+          if [[ $DUMPS -ne 1 ]]; then  # the round index is only appended when DUMPS is not 1
+            dump_file_name+="-$i"
+          fi
+          dump_file_name+=".txt"
+
+          dump_cmd=${JCMD_DUMP_CMD_TMPL/"PID"/"$pid"}  # fill the process id into the command template
+          $KUBE_CLIENT exec -n "$NAMESPACE" "$POD" -c "$c" -- /bin/bash -c "$dump_cmd" > "${OUT_DIR}/dumps/${c}/$dump_file_name"
+          ((++dump_count))  # pre-increment evaluates to a non-zero value, so "set -e" is not triggered
+      done
+    done
+done
+
+if [[ $dump_count -eq 0 ]]; then  # nothing was written in any round
+  error "Could not capture any thread dumps in the specified pod"
+fi
+
+FILENAME="tdumps-${NAMESPACE}-${POD}-$(date +"%d-%m-%Y_%H-%M-%S")"  # archive base name with a day-month-year_hour-minute-second stamp
+OLD_DIR="$(pwd)"
+cd "$OUT_DIR" || exit  # zip from inside OUT_DIR so the archive stores the relative path ./dumps/
+zip -qr "$FILENAME".zip ./dumps/
+cd "$OLD_DIR" || exit
+if [[ $OUT_DIR == *"tmp."* ]]; then  # NOTE(review): matches any OUT_DIR containing "tmp.", including a user-supplied one
+  # keeping the old behavior when --out-dir is not specified
+  mv "$OUT_DIR"/"$FILENAME".zip ./
+fi
+echo "Thread dump collection file $FILENAME.zip created"
+} # this ensures that the entire script is downloaded #
diff --git a/tools/report.sh b/tools/report.sh
@@ -204,6 +204,32 @@ for RES in "${RESOURCES[@]}"; do
   fi
 done
 
+echo "describe topics"
+get_topic_describe() {  # best-effort: runs kafka-topics.sh --describe inside one Kafka pod and saves the output under reports/topics
+  local pod; pod=$($KUBE_CLIENT -n "$NAMESPACE" get po -l strimzi.io/kind=Kafka,strimzi.io/name="$CLUSTER-kafka" --ignore-not-found --no-headers -o jsonpath='{range .items[*]}{.status.containerStatuses[*].ready.true}{.metadata.name}{ "\n"}{end}' | head -n 1)  # function-local; only the first output line is kept
+
+  if [[ -n "$pod" ]]; then
+    mkdir -p "$OUT_DIR"/reports/topics
+
+    $KUBE_CLIENT -n "$NAMESPACE" exec "$pod" -- bash -c '# Extract variables from strimzi.properties && \
+        listener=$(grep "control.plane.listener.name" /tmp/strimzi.properties | sed -e "s/control.plane.listener.name=//g") && \
+        port=$(grep "control.plane.listener.name" /tmp/strimzi.properties | sed -e "s/.*-//g") && \
+        bootstrapserver=$(grep "advertised.listeners" /tmp/strimzi.properties | sed -e "s/.*$listener:\/\/\(.*\):$port.*/\1/") && \
+        # Create client config file && \
+        grep -i "listener.name.$listener." /tmp/strimzi.properties | sed -e "s/listener.name.$listener.//gI" > /tmp/report-client-config.properties && \
+        echo "security.protocol=ssl" >> /tmp/report-client-config.properties && \
+        # Execute topic describe && \
+        bin/kafka-topics.sh --describe --command-config=/tmp/report-client-config.properties --bootstrap-server $bootstrapserver:$port' \
+        > "$OUT_DIR"/reports/topics/topic-describe.txt 2>/dev/null||true  # stderr is dropped and a failing exec is ignored
+    $KUBE_CLIENT -n "$NAMESPACE" exec "$pod" -- bash -c 'rm -rf /tmp/report-client-config.properties' 2>/dev/null||true  # remove the temporary client config from the pod
+
+    echo "    topic describe executed, /reports/topics/topic-describe.txt created"
+  else
+    echo "    topic describe failed due to no kafka pods available"
+  fi
+}
+get_topic_describe
+
 get_nonnamespaced_yamls() {
   local type="$1"
   mkdir -p "$OUT_DIR"/reports/"$type"
@@ -304,6 +330,19 @@ for CRD in $CRDS; do
   fi
 done
 
+echo "all kafkas"
+mkdir -p "$OUT_DIR"/reports/all_kafkas
+mapfile -t KAFKA_CLUSTERS < <($KUBE_CLIENT get kafkas --all-namespaces --ignore-not-found --no-headers -ojsonpath="{range .items[*]}{.metadata.namespace}/{.metadata.name}{'\n'}{end}")  # one "namespace/name" entry per Kafka resource, across all namespaces
+if [[ ${#KAFKA_CLUSTERS[@]} -ne 0 ]]; then
+  for kafka_cluster in "${KAFKA_CLUSTERS[@]}"; do
+    echo "    $kafka_cluster"
+    kafka_cluster_ns=${kafka_cluster%/*}  # part before the "/"
+    kafka_cluster_name=${kafka_cluster#*/}  # part after the "/"
+    mkdir -p "$OUT_DIR/reports/all_kafkas/$kafka_cluster_ns"
+    $KUBE_CLIENT get kafka -n "$kafka_cluster_ns" "$kafka_cluster_name" -o yaml > "$OUT_DIR/reports/all_kafkas/$kafka_cluster_ns/$kafka_cluster_name.yaml"||true  # a failing get (e.g. resource deleted since it was listed) must not stop the report
+  done
+fi
+
 echo "events"
 EVENTS=$($KUBE_CLIENT get event -n "$NAMESPACE" --ignore-not-found) && readonly EVENTS
 if [[ -n $EVENTS ]]; then