#!/bin/bash
# This script serves as a common entrypoint for CI infra
# as well as developers looking to run test suites for the
# project. The script can either set up a cluster to test
# or run against a cluster that is already up.
#
# Cluster end-to-end tests will be run first, followed by
# other test suites. If a specific suite or suites are
# requested with $SUITE, only those suites will be run.
#
# This script expects the following environment variables:
#  - TEST_ONLY: do not set up a cluster. Must be paired with
#    a $KUBECONFIG that points to the cluster to test
#  - SUITE: a regex that chooses which test suites are run.
#    Test suite entrypoints exist under hack/testing/ with
#    the test- prefix. The regex in $SUITE is a simple filter.
#  - EXCLUDE_SUITE: a regex that chooses which test suites are
#    not run. Test suite entrypoints exist under hack/testing/
#    with the test- prefix. The regex in $EXCLUDE_SUITE is a
#    simple filter like $SUITE, only with the opposite effect.
#  - JUNIT_REPORT: generate a JUnit XML report for the tests
source "$(dirname "${BASH_SOURCE[0]}" )/../lib/init.sh"
source "${OS_O_A_L_DIR}/hack/testing/util.sh"
# we have to declare a suite start in order to use the os::cmd functions
os::test::junit::declare_suite_start "entrypoint"
LOGGING_NS=openshift-logging
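# a legacy (pre-operator) deployment runs in the "logging" project with
# DeploymentConfigs; if one exists, test against that namespace instead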
if oc get project logging -o name > /dev/null 2>&1 && [ $(oc get dc -n logging -o name 2> /dev/null | wc -l) -gt 0 ] ; then
    LOGGING_NS=logging
fi
export LOGGING_NS
# if using operators, turn off the managed state
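# so that the cluster-logging-operator does not reconcile away the
# test-time patches applied below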
if oc get clusterlogging instance > /dev/null 2>&1 ; then
    oc patch -n ${LOGGING_NS} clusterlogging instance --type=json --patch '[
        {"op":"replace","path":"/spec/managementState","value":"Unmanaged"}]'
fi
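# save the original fluentd daemonset spec to the test artifacts before
# patching it below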
fluentd_ds=$( get_fluentd_ds_name )
oc get -n ${LOGGING_NS} $fluentd_ds -o yaml > "${ARTIFACT_DIR}/logging-fluentd-orig.yaml"
# patch fluentd and the node to make it easier to test in a new environment
if oc get clusterlogging instance > /dev/null 2>&1 ; then
    tolerations="$( oc get -n ${LOGGING_NS} $fluentd_ds -o jsonpath='{.spec.template.spec.tolerations}' )"
    if [ -n "$tolerations" ] ; then
        oc patch -n ${LOGGING_NS} $fluentd_ds --type=json --patch '[
            {"op":"remove","path":"/spec/template/spec/tolerations"}]'
    fi
    nodesel="$( oc get -n ${LOGGING_NS} $fluentd_ds -o jsonpath='{.spec.template.spec.nodeSelector}' )"
    if [ -z "$nodesel" ] ; then
        oc patch -n ${LOGGING_NS} $fluentd_ds --type=json --patch '[
            {"op":"add","path":"/spec/template/spec/nodeSelector","value":{"logging-infra-fluentd":"true"}}]'
        oc patch -n ${LOGGING_NS} clusterlogging instance --type=json --patch '[
            {"op":"add","path":"/spec/collection/logs/fluentd/nodeSelector","value":{"logging-infra-fluentd":"true"}},
            {"op":"add","path":"/spec/collection/logs/rsyslog/nodeSelector","value":{"logging-infra-rsyslog":"true"}}]'
    fi
    kibnode=$( oc get -n ${LOGGING_NS} pods -l component=kibana -o jsonpath='{.items[0].spec.nodeName}' )
    oc label node $kibnode --overwrite logging-infra-fluentd=true
    # wait until there is only 1 fluentd running, and it is on the kibana node
    os::cmd::try_until_text "oc get -n ${LOGGING_NS} $fluentd_ds -o jsonpath='{ .status.numberReady }'" '^1$' $(( 2 * minute ))
    os::cmd::try_until_text "oc get -n ${LOGGING_NS} pods -l component=fluentd -o jsonpath='{.items[0].spec.nodeName}'" "$kibnode" $(( 2 * minute ))
    # richm 20190117
    # these tests need the ability to create a user with a token:
    #   check-logs test-access-control test-kibana-dashboards test-multi-tenancy
    # these tests use systemctl or other programs not available in the container:
    #   test-out_rawtcp test-remote-syslog test-zzz-duplicate-entries test-zzz-rsyslog
    # fails because there are no logs from apps in the default namespace:
    #   test-read-throttling
    # fails - not sure why - maybe we have to run the load generators as separate pods:
    #   test-zzzz-bulk-rejection
    # cannot mount a file inside one pod into another pod - rewrite to use a configmap or secret:
    #   test-viaq-data-model
    expected_failures=(
        test-out_rawtcp test-remote-syslog test-zzz-duplicate-entries
        test-read-throttling test-viaq-data-model test-zzzz-bulk-rejection
    )
else
    expected_failures=(
        NONE
    )
    # some tests expect the node to be labeled as in the CI environment
    kibnode=$( oc get pods -l component=kibana -o jsonpath='{.items[0].spec.nodeName}' )
    oc label node $kibnode --overwrite logging-ci-test=true
fi
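# stop fluentd while its daemonset is patched below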
stop_fluentd
# HACK HACK HACK
#
# There seems to be some sort of performance problem - richm 2017-08-15.
# Not sure what has changed, but when running an all-in-one cluster for CI,
# with both the openshift master and node running as systemd services logging
# to the journal, the default/logging pods and the OS spew too much for
# fluentd to keep up with its default 100m cpu limit on an AWS m4.xlarge
# system. For now, remove the cpu limit on fluentd to unblock the tests.
if [[ -z "${USE_DEFAULT_FLUENTD_CPU_LIMIT:-}" && -n "$(oc get -n ${LOGGING_NS} $fluentd_ds -o jsonpath={.spec.template.spec.containers[0].resources.limits.cpu})" ]] ; then
    oc patch -n ${LOGGING_NS} $fluentd_ds --type=json --patch '[
        {"op":"remove","path":"/spec/template/spec/containers/0/resources/limits/cpu"}]'
fi
# Make CI run with enabled debug logs for journald (BZ 1505602)
oc set -n ${LOGGING_NS} env $fluentd_ds COLLECT_JOURNAL_DEBUG_LOGS=true
# Make CI run with MUX_CLIENT_MODE off by default - individual tests will set
# MUX_CLIENT_MODE=maximal or minimal
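# (the trailing "-" in the oc set env call below removes the variable)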
oc set -n ${LOGGING_NS} env $fluentd_ds MUX_CLIENT_MODE-
# Starting in 3.10, we can no longer mount /var/lib/docker/containers
oc volumes -n ${LOGGING_NS} $fluentd_ds --overwrite --add -t hostPath \
    --name=varlibdockercontainers -m /var/lib/docker --path=/var/lib/docker || :
# we're finished hacking fluentd - start it
start_fluentd
# start a fluentd performance monitor
monitor_fluentd_top() {
    # assumes running in a subshell
    cp $KUBECONFIG $ARTIFACT_DIR/monitor_fluentd_top.kubeconfig
    export KUBECONFIG=$ARTIFACT_DIR/monitor_fluentd_top.kubeconfig
    oc project ${LOGGING_NS} > /dev/null
    while true ; do
        fpod=$( get_running_pod fluentd 2> /dev/null ) || :
        if [ -n "$fpod" ] ; then
            oc exec $fpod -- top -b -d 1 || :
        else
            # if we got here, the fluentd pod was restarted
            echo $( date --rfc-3339=ns ) fluentd is not running
            sleep 1
        fi
    done > $ARTIFACT_DIR/monitor_fluentd_top.log 2>&1
}
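# every second, log how many journal entries lie beyond fluentd's saved
# cursor (/var/log/journal.pos) and how long journalctl took to read them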
monitor_fluentd_pos() {
    while true ; do
        local cursor=$( get_journal_pos_cursor )
        if [ -n "$cursor" ] ; then
            local startts=$( date +%s )
            local count=$( sudo journalctl -m -c $cursor | wc -l )
            local endts=$( date +%s )
            echo $endts $( expr $endts - $startts ) $count
        else
            echo $( date --rfc-3339=ns ) no /var/log/journal.pos
        fi
        sleep 1
    done > $ARTIFACT_DIR/monitor_fluentd_pos.log 2>&1
}
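# log the number of journal entries written during each 60-second interval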
monitor_journal_lograte() {
    local interval=60
    while true ; do
        count=$( sudo journalctl -m -S "$( date +'%Y-%m-%d %H:%M:%S' --date="$interval seconds ago" )" | wc -l )
        echo $( date +%s ) $count
        sleep $interval
    done > $ARTIFACT_DIR/monitor_journal_lograte.log 2>&1
}
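# periodically capture the Elasticsearch bulk thread pool statistics for the
# es cluster, and for the es-ops cluster when it is a separate service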
monitor_es_bulk_stats() {
    local interval=5
    cp $KUBECONFIG $ARTIFACT_DIR/monitor_es_bulk_stats.kubeconfig
    export KUBECONFIG=$ARTIFACT_DIR/monitor_es_bulk_stats.kubeconfig
    oc project ${LOGGING_NS} > /dev/null
    # wait for espod
    local espod=$( get_es_pod es 2> /dev/null ) || :
    while [ -z "${espod}" ] ; do
        sleep 1
        espod=$( get_es_pod es 2> /dev/null ) || :
    done
    es_ver=$( get_es_major_ver ) || :
    while [ -z "${es_ver}" ] ; do
        es_ver=$( get_es_major_ver ) || :
        sleep 1
    done
    bulk_url=$( get_bulk_thread_pool_url $es_ver "v" c r a q s qs )
    while true ; do
        local essvc=$( get_es_svc es 2> /dev/null ) || :
        local esopssvc=$( get_es_svc es-ops 2> /dev/null ) || :
        esopspod=${esopssvc:-$essvc}
        if [ -n "${essvc}" ] ; then
            date -Ins >> $ARTIFACT_DIR/monitor_es_bulk_stats-es.log 2>&1
            curl_es $essvc "${bulk_url}" >> $ARTIFACT_DIR/monitor_es_bulk_stats-es.log 2>&1 || :
        fi
        if [ -n "${esopssvc}" -a "${essvc}" != "${esopssvc}" ] ; then
            date -Ins >> $ARTIFACT_DIR/monitor_es_bulk_stats-es-ops.log 2>&1
            curl_es $esopssvc "${bulk_url}" >> $ARTIFACT_DIR/monitor_es_bulk_stats-es-ops.log 2>&1 || :
        fi
        sleep $interval
    done
}
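# launch the monitors in the background, remembering their pids so that
# cleanup can kill them on exit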
monitor_fluentd_top & killpids=$!
monitor_fluentd_pos & killpids="$killpids $!"
monitor_journal_lograte & killpids="$killpids $!"
monitor_es_bulk_stats & killpids="$killpids $!"
function cleanup() {
    return_code=$?
    kill $killpids
    os::cleanup::all "${return_code}"
    exit "${return_code}"
}
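# on exit, kill the background monitors and run the standard cleanup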
trap "cleanup" EXIT
rm -f ${OS_O_A_L_DIR}/temp/htpw.file
if [[ -z "${TEST_ONLY:-}" ]]; then
"${OS_O_A_L_DIR}/hack/testing/setup.sh"
elif [[ -z "${KUBECONFIG:-}" ]]; then
os::log::fatal "A \$KUBECONFIG must be specified with \$TEST_ONLY."
fi
function run_suite() {
local test="$1"
suite_name="$( basename "${test}" '.sh' )"
os::test::junit::declare_suite_start "test/setup/${suite_name}"
os::cmd::expect_success "oc login -u system:admin"
os::cmd::expect_success "oc project $LOGGING_NS"
os::test::junit::declare_suite_end
os::log::info "Logging test suite ${suite_name} started at $( date )"
ops_cluster=${ENABLE_OPS_CLUSTER:-"true"}
if OS_TMP_ENV_SET= LOG_DIR= ARTIFACT_DIR= "${test}" "${ops_cluster}"; then
os::log::info "Logging test suite ${suite_name} succeeded at $( date )"
if grep -q "${suite_name}" <<<"${expected_failures[@]}"; then
os::log::warning "Logging suite ${suite_name} is expected to fail"
fi
else
os::log::warning "Logging test suite ${suite_name} failed at $( date )"
if grep -q "${suite_name}" <<<"${expected_failures[@]}"; then
os::log::info "Logging suite ${suite_name} failure result ignored"
else
failed="true"
fi
fi
}
# done with entrypoint/bootstrapping - begin main tests
os::test::junit::declare_suite_end
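# default to a regex that can never match, so no suites are excluded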
EXCLUDE_SUITE="${EXCLUDE_SUITE:-"$^"}"
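# run the check-* suites (cluster end-to-end tests) first, then the test-* suites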
for suite_selector in ${SUITE:-".*"} ; do
    for test in $( find "${OS_O_A_L_DIR}/hack/testing" -type f -name 'check-*.sh' | grep -E "${suite_selector}" | grep -Ev "${EXCLUDE_SUITE}" | sort ); do
        run_suite "${test}"
    done
done
for suite_selector in ${SUITE:-".*"} ; do
    for test in $( find "${OS_O_A_L_DIR}/hack/testing" -type f -name 'test-*.sh' | grep -E "${suite_selector}" | grep -Ev "${EXCLUDE_SUITE}" | sort ); do
        run_suite "${test}"
    done
done
if [[ -n "${failed:-}" ]]; then
    exit 1
fi