Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make otel ta tests a bit less flaky and the error logs a bit better #5873

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash -eux
set -o pipefail
which jq || (echo "jq not found" && exit 1)
source "${SOURCE_DIR}/packaging-scripts/cicd-tests/add-access-token.sh"
source "${SOURCE_DIR}/packaging-scripts/cicd-tests/test-utils.sh"
BUILD_DIR="$(realpath "$BUILD_DIR")"

CI_JOB_ID="${CI_JOB_ID:-$(mktemp -d)}"
Expand Down Expand Up @@ -48,67 +48,78 @@ GATEWAY_AGENT_LOGS_DIR="$TEST_FOLDER/$GATEWAY_AGENT_REPACKED_TA_NAME/"
mkdir -p "$GATEWAY_AGENT_LOGS_DIR"

# It can take quite some time to extract the agent bundle. Wait for it to finish before trying to pull otel.log.
MAX_ATTEMPTS=6
DELAY=60
if [ "$PLATFORM" == "windows" ]; then
MAX_ATTEMPTS=96
else
MAX_ATTEMPTS=36
fi
DELAY=10
ATTEMPT=1
while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
scp -i ~/.orca/id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r "$ORCA_SSH_USER@$GATEWAY_IPV4_ADDR":/opt/splunk/var/log/splunk/Splunk_TA_otel.log "$GATEWAY_LOGS_DIR"
if grep -qi "Done extracting agent bundle" "$GATEWAY_LOGS_DIR/Splunk_TA_otel.log"; then
scp -i ~/.orca/id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r "$ORCA_SSH_USER@$GATEWAY_IPV4_ADDR":/opt/splunk/var/log/splunk/ "$GATEWAY_LOGS_DIR"
if safe_grep_log "Done extracting agent bundle" "$GATEWAY_LOGS_DIR/splunk/Splunk_TA_otel.log"; then
break
fi
echo "Extraction not complete according to Splunk_TA_otel.log... Retrying in $DELAY seconds"
ATTEMPT=$((ATTEMPT + 1))
sleep $DELAY
done
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Failed to extract agent bundle after $MAX_ATTEMPTS attempts."
cat "$GATEWAY_LOGS_DIR/Splunk_TA_otel.log"
safe_tail "$GATEWAY_LOGS_DIR/splunk/splunkd.log" 200
safe_tail "$GATEWAY_LOGS_DIR/splunk/Splunk_TA_otel.log"
echo "Failed to extract agent bundle after $MAX_ATTEMPTS attempts. Logs above."
exit 1
fi

MAX_ATTEMPTS=6
MAX_ATTEMPTS=12
DELAY=10
ATTEMPT=1
while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
scp -i ~/.orca/id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r "$ORCA_SSH_USER@$GATEWAY_IPV4_ADDR":/opt/splunk/var/log/splunk/otel.log "$GATEWAY_LOGS_DIR"
if grep -qi "Everything is ready" "$GATEWAY_LOGS_DIR/otel.log"; then
scp -i ~/.orca/id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r "$ORCA_SSH_USER@$GATEWAY_IPV4_ADDR":/opt/splunk/var/log/splunk/ "$GATEWAY_LOGS_DIR"
if safe_grep_log "Everything is ready" "$GATEWAY_LOGS_DIR/splunk/otel.log"; then
break
fi
echo "Did not see startup message according to otel.log... Retrying in $DELAY seconds"
ATTEMPT=$((ATTEMPT + 1))
sleep $DELAY
done
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Failed to see startup message in otel.log after $MAX_ATTEMPTS attempts."
cat "$GATEWAY_LOGS_DIR/otel.log"
safe_tail "$GATEWAY_LOGS_DIR/splunk/Splunk_TA_otel.log"
safe_tail "$GATEWAY_LOGS_DIR/splunk/otel.log"
echo "Failed to see startup message in otel.log after $MAX_ATTEMPTS attempts. Logs above."
exit 1
fi

# It can take quite some time to extract the agent bundle (7+ minutes between the start and end log messages). Wait for it to finish before trying to pull otel.log.
MAX_ATTEMPTS=12
DELAY=60
if [ "$PLATFORM" == "windows" ]; then
MAX_ATTEMPTS=96
else
MAX_ATTEMPTS=36
fi
DELAY=10
ATTEMPT=1
while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
scp -i ~/.orca/id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r "$ORCA_SSH_USER@$GATEWAY_AGENT_IPV4_ADDR":/opt/splunk/var/log/splunk/Splunk_TA_otel.log "$GATEWAY_AGENT_LOGS_DIR"
if grep -qi "Done extracting agent bundle" "$GATEWAY_AGENT_LOGS_DIR/Splunk_TA_otel.log"; then
scp -i ~/.orca/id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r "$ORCA_SSH_USER@$GATEWAY_AGENT_IPV4_ADDR":/opt/splunk/var/log/splunk/ "$GATEWAY_AGENT_LOGS_DIR"
if safe_grep_log "Done extracting agent bundle" "$GATEWAY_AGENT_LOGS_DIR/splunk/Splunk_TA_otel.log"; then
break
fi
echo "Extraction not complete according to Splunk_TA_otel.log... Retrying in $DELAY seconds"
ATTEMPT=$((ATTEMPT + 1))
sleep $DELAY
done
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Failed to extract agent bundle after $MAX_ATTEMPTS attempts."
cat "$GATEWAY_AGENT_LOGS_DIR/Splunk_TA_otel.log"
safe_tail "$GATEWAY_AGENT_LOGS_DIR/splunk/splunkd.log" 200
safe_tail "$GATEWAY_AGENT_LOGS_DIR/splunk/Splunk_TA_otel.log"
echo "Failed to extract agent bundle after $MAX_ATTEMPTS attempts. Logs above if present."
exit 1
fi

MAX_ATTEMPTS=6
MAX_ATTEMPTS=24
DELAY=10
ATTEMPT=1
while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
scp -i ~/.orca/id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r "$ORCA_SSH_USER@$GATEWAY_AGENT_IPV4_ADDR":/opt/splunk/var/log/splunk/otel.log "$GATEWAY_AGENT_LOGS_DIR"
if grep -qi "Everything is ready" "$GATEWAY_AGENT_LOGS_DIR/otel.log"; then
if safe_grep_log "Everything is ready" "$GATEWAY_AGENT_LOGS_DIR/otel.log"; then
break
fi
echo "Did not see startup message according to otel.log... Retrying in $DELAY seconds"
Expand All @@ -117,7 +128,7 @@ while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
done
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Failed to see startup message in otel.log after $MAX_ATTEMPTS attempts."
cat "$GATEWAY_AGENT_LOGS_DIR/otel.log"
safe_tail "$GATEWAY_AGENT_LOGS_DIR/otel.log"
exit 1
fi

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

set -o pipefail
which jq || (echo "jq not found" && exit 1)
source "${SOURCE_DIR}/packaging-scripts/cicd-tests/add-access-token.sh"
source "${SOURCE_DIR}/packaging-scripts/cicd-tests/test-utils.sh"
BUILD_DIR="$(realpath "$BUILD_DIR")"
TA_FULLPATH="$(repack_with_access_token "$OLLY_ACCESS_TOKEN" "$BUILD_DIR/out/distribution/Splunk_TA_otel.tgz" | tail -n 1)"
CI_JOB_ID="${CI_JOB_ID:-$(basename $(dirname "$TA_FULLPATH"))}"
Expand All @@ -15,29 +15,28 @@ deployment_id="$(jq -r '.orca_deployment_id' < "$TEST_FOLDER/orca_deployment.jso
ip_addr="$(jq -r '.server_roles.standalone[0].host' < "$TEST_FOLDER/orca_deployment.json")"

# Check for successful startup
ATTEMPT=1
if [ "$PLATFORM" == "windows" ]; then
MAX_ATTEMPTS=12 # Windows takes a long time to extract, often 7 minutes on default hardware
DELAY=60
MAX_ATTEMPTS=96 # Windows takes a long time to extract, often 7 minutes on default hardware
else
MAX_ATTEMPTS=6
DELAY=20
MAX_ATTEMPTS=36
fi
ATTEMPT=1
DELAY=10
while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
# Copy logs from container
scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r -i ~/.orca/id_rsa "splunk@$ip_addr:/opt/splunk/var/log/splunk/" "$TEST_FOLDER"
if grep -q "Starting otel agent" "$TEST_FOLDER/splunk/Splunk_TA_otel.log" &&
grep -q "Everything is ready" "$TEST_FOLDER/splunk/otel.log"; then
if safe_grep_log "Starting otel agent" "$TEST_FOLDER/splunk/Splunk_TA_otel.log" &&
safe_grep_log "Everything is ready" "$TEST_FOLDER/splunk/otel.log"; then
break
fi
ATTEMPT=$((ATTEMPT + 1))
sleep "$DELAY"
done
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Failed to find successful startup message(s) after $MAX_ATTEMPTS attempts."
cat "$TEST_FOLDER/splunk/splunkd.log"
cat "$TEST_FOLDER/splunk/Splunk_TA_otel.log"
cat "$TEST_FOLDER/splunk/otel.log"
safe_tail "$TEST_FOLDER/splunk/splunkd.log" 200
safe_tail "$TEST_FOLDER/splunk/Splunk_TA_otel.log"
safe_tail "$TEST_FOLDER/splunk/otel.log"
echo "Failed to find successful startup message(s) after $MAX_ATTEMPTS attempts. Logs above."
exit 1
fi

Expand All @@ -62,17 +61,17 @@ while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
sleep $DELAY
done
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Failed to find metrics within $CUTOFF_DELTA after $MAX_ATTEMPTS attempts."
cat "$TEST_FOLDER/splunk/otel.log"
cat "$TEST_FOLDER/uptime.json"
safe_tail "$TEST_FOLDER/splunk/otel.log"
safe_tail "$TEST_FOLDER/uptime.json"
echo "Failed to find metrics within $CUTOFF_DELTA after $MAX_ATTEMPTS attempts. Logs above."
exit 1
fi

# Verify the addon can be restarted successfully
orca_container_name=$(splunk_orca --cloud "${ORCA_CLOUD}" --printer json show --deployment-id "${deployment_id}" containers | jq -r '.[keys[0]] | .[keys[0]] | .containers | keys[0]')
splunk_orca --cloud "${ORCA_CLOUD}" exec --exec-user splunk "${orca_container_name}" '/opt/splunk/bin/splunk restart'

MAX_ATTEMPTS=30
MAX_ATTEMPTS=12
DELAY=10
ATTEMPT=1
if [ "$PLATFORM" == "windows" ]; then
Expand All @@ -83,34 +82,34 @@ fi
while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r -i ~/.orca/id_rsa "splunk@$ip_addr:/opt/splunk/var/log/splunk/$restart_log_file" "$TEST_FOLDER/splunk/$restart_log_file"
# There seems to be an issue on linux where it does not gracefully wait for the job to shut down, need to investigate further.
(grep -q "INFO Otel agent stop" "$TEST_FOLDER/splunk/$restart_log_file" || grep -q "INFO Stopping otel" "$TEST_FOLDER/splunk/$restart_log_file") && break
(safe_grep_log "INFO Otel agent stop" "$TEST_FOLDER/splunk/$restart_log_file" || safe_grep_log "INFO Stopping otel" "$TEST_FOLDER/splunk/$restart_log_file") && break
ATTEMPT=$((ATTEMPT + 1))
sleep $DELAY
done

if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Failed to see restart log after $MAX_ATTEMPTS attempts."
cat "$TEST_FOLDER/splunk/$restart_log_file"
echo "Failed to see restart log after $MAX_ATTEMPTS attempts. Logs above."
exit 1
fi

# Ensure restart was successful as well
MAX_ATTEMPTS=24
DELAY=10
ATTEMPT=1
MAX_ATTEMPTS=6
while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r -i ~/.orca/id_rsa "splunk@$ip_addr:/opt/splunk/var/log/splunk/Splunk_TA_otel.log" "$TEST_FOLDER/splunk/Splunk_TA_otel.log"
scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -r -i ~/.orca/id_rsa "splunk@$ip_addr:/opt/splunk/var/log/splunk/otel.log" "$TEST_FOLDER/splunk/otel.log"
if grep -q "Starting otel agent" "$TEST_FOLDER/splunk/Splunk_TA_otel.log" && grep -q "Everything is ready" "$TEST_FOLDER/splunk/otel.log"; then
if safe_grep_log "Starting otel agent" "$TEST_FOLDER/splunk/Splunk_TA_otel.log" && safe_grep_log "Everything is ready" "$TEST_FOLDER/splunk/otel.log"; then
break
fi
ATTEMPT=$((ATTEMPT + 1))
sleep $DELAY
done
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Failed to see restarted log after $MAX_ATTEMPTS attempts."
cat "$TEST_FOLDER/splunk/Splunk_TA_otel.log"
cat "$TEST_FOLDER/splunk/otel.log"
safe_tail "$TEST_FOLDER/splunk/Splunk_TA_otel.log"
safe_tail "$TEST_FOLDER/splunk/otel.log"
echo "Failed to see restarted log after $MAX_ATTEMPTS attempts. Logs above."
exit 1
fi

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,27 @@ repack_with_access_token() {
echo "$repacked"
return 0
}

#######################################
# Print a log file for diagnostics without failing when it is missing.
# Intended for failure paths, so it is best-effort and always returns 0.
# Arguments:
#   $1 - path of the file to print
#   $2 - optional; when non-empty, print only the last $2 lines,
#        otherwise print the whole file
# Outputs:
#   The file contents (or its tail) on stdout, or
#   "File <path> not found" when the path is not a regular file.
# Returns:
#   0 in all cases.
#######################################
safe_tail() {
  local filename="$1"
  # ${2:-} tolerates a missing second argument under `set -u` without
  # toggling (and thereby overriding) the caller's nounset setting.
  local taillines="${2:-}"

  if [ ! -f "$filename" ]; then
    echo "File $filename not found"
    return 0
  fi

  if [ -n "$taillines" ]; then
    tail -n "$taillines" -- "$filename" || echo "File $filename could not be read"
  else
    # Only the file itself is passed to cat; an empty extra operand would
    # make cat fail and trigger a misleading "not found" message.
    cat -- "$filename" || echo "File $filename could not be read"
  fi
}

#######################################
# Case-insensitively search a log file for a pattern, tolerating a
# missing file (e.g. a log that has not been copied or created yet).
# Arguments:
#   $1 - pattern to search for (grep basic regular expression)
#   $2 - path of the file to search
# Outputs:
#   "<path> not found" on stdout when the path is not a regular file;
#   nothing otherwise (grep runs quietly).
# Returns:
#   0 if the pattern was found, 1 if the file is missing, otherwise
#   grep's own exit status (1 = no match, >1 = error).
#######################################
safe_grep_log() {
  local searchstring="$1"
  local filename="$2"

  if [ ! -f "$filename" ]; then
    echo "$filename not found"
    return 1
  fi

  # grep is the last command, so its exit status becomes the function's
  # status; `return (cmd)` is not valid shell syntax.
  grep -qi -- "$searchstring" "$filename"
}
Loading