-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsanitychecks.py
574 lines (487 loc) · 19.9 KB
/
sanitychecks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
# Copyright (c) 2013-2015 Centre for Advanced Internet Architectures,
# Swinburne University of Technology. All rights reserved.
#
# Author: Sebastian Zander (szander@swin.edu.au)
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
## @package sanitychecks
# Sanity checks
#
# $Id: sanitychecks.py 1313 2015-05-05 05:58:01Z szander $
import sys
import os
import re
import datetime
import config
from fabric.api import task, warn, local, run, execute, abort, hosts, \
env, settings, parallel, serial, puts, put
from hosttype import get_type_cached
from hostint import get_netint_cached, get_netint_windump_cached
from hostmac import get_netmac_cached
from trafficgens import start_iperf, start_ping, \
start_http_server, start_httperf, \
start_httperf_dash, create_http_dash_content, \
create_http_incast_content, start_httperf_incast, \
start_nttcp, start_httperf_incast_n, \
start_fps_game
def _args(*_nargs, **_kwargs):
"Collect parameters for a call"
return _nargs, _kwargs
# helper method for variable name checking
def _reg_vname(name, vnames, entry):
vnames[name] = entry
# hacky: return 0 so we don't break mathematical operations on the
# parameters and eval fails
return 0
## Check router queues
#  @param queue_spec Queue specification from config
#  @param vnames_referenced Dictionary of variables names to add to
def check_router_queues(queue_spec, vnames_referenced):
    """Check one list of router queue entries.

    Each item of `queue_spec` is an (id, parameter string) pair. For every
    entry the parameter string is evaluated as a call argument list (so a
    malformed string raises here), every V_ variable name it uses is
    recorded in `vnames_referenced`, and abort() is called if an id occurs
    more than once. Entries are numbered from 1 in messages.
    """
    seen_ids = set()

    for entry, (c, v) in enumerate(queue_spec, 1):
        # Replace every V_ name with a _reg_vname() call so that evaluating
        # the string registers the name in vnames_referenced. The parameter
        # name `vnames_referenced` must not be renamed: the generated code
        # looks it up by that name.
        v = re.sub(
            "(V_[a-zA-Z0-9_-]*)",
            "_reg_vname('\\1', vnames_referenced, 'TPCONF_router_queues entry %s')" %
            entry,
            v)

        # NOTE(review): eval() executes text taken from the config file;
        # only run this against config files you trust.
        eval('_args(%s)' % v)

        if c in seen_ids:
            # ids are not converted to int in this function, so they are
            # formatted with %s; %i raised TypeError for string ids
            abort(
                'TPCONF_router_queues entry %s: reused id value %s' %
                (entry, c))
        seen_ids.add(c)
## Check config file settings (TASK)
@task
def check_config():
    """Check config file

    Validates the TPCONF_* settings of the imported config module and calls
    abort() with a message at the first problem found. Checks, in order:
    script path, optional tftpboot dir, optional power cycle settings,
    router/host lists, internal IPs, optional host OS settings, duration,
    optional ECN flags, router queues, traffic generators and the parameter
    definitions, including that every V_ variable referenced is defined.
    Sets config.TPCONF_debug_level to 0 if it is not defined.
    """

    # XXX add checks for existence of some variables before using them

    # Use the module's __file__ instead of parsing repr(module), which
    # truncated the name whenever the path contained a space.
    config_file = getattr(sys.modules['config'], '__file__', 'unknown')
    puts('Config file: %s' % config_file)

    # ---- script path and version ----
    if config.TPCONF_script_path == '' or not os.path.exists(
            config.TPCONF_script_path):
        abort(
            'TPCONF_script_path must be set to existing directory ' +
            'containing the .py files')

    # test if path incorrectly set
    file_name = config.TPCONF_script_path + '/sanitychecks.py'
    try:
        with open(file_name):
            pass
    except IOError:
        abort(
            'TPCONF_script_path seems to be incorrect, could not ' +
            'find sanitychecks.py')

    version_file = config.TPCONF_script_path + '/VERSION'
    version = 'no version info'
    try:
        with open(version_file) as f:
            # readline() returns '' for an empty file, whereas
            # readlines()[0] raised IndexError
            first_line = f.readline()
            if first_line:
                version = first_line
    except IOError:
        # the VERSION file is optional
        pass
    version = version.rstrip()

    puts('Script path: %s' % config.TPCONF_script_path)
    puts('Script version: %s' % version)

    # ---- optional tftpboot directory ----
    tftpboot_dir = getattr(config, 'TPCONF_tftpboot_dir', '')
    if tftpboot_dir != '':
        if not os.path.exists(tftpboot_dir):
            abort('TPCONF_tftpboot_dir must be set to existing directory')

        # test if path incorrectly set
        file_name = tftpboot_dir + '/conf.ipxe'
        try:
            with open(file_name):
                pass
        except IOError:
            abort(
                'TPCONF_tftpboot_dir seems to be incorrect, could not ' +
                'find conf.ipxe')

    # ---- optional power cycling ----
    do_power_cycle = getattr(config, 'TPCONF_do_power_cycle', '0')
    if do_power_cycle == '1':
        # check that we have entry for all hosts
        for h in (config.TPCONF_router + config.TPCONF_hosts):
            if h not in config.TPCONF_host_power_ctrlport:
                abort('No entry in TPCONF_host_power_ctrlport for host %s' % h)

        # NOTE(review): the source indentation was lost; these two checks
        # are assumed to apply only when power cycling is enabled - confirm.
        if config.TPCONF_power_admin_name == '':
            abort('TPCONF_power_admin_name not defined')
        if config.TPCONF_power_admin_pw == '':
            abort('TPCONF_power_admin_pw not defined')

    # ---- basic settings ----
    if config.TPCONF_test_id == '':
        warn('TPCONF_test_id is not set in config file')

    if len(config.TPCONF_router) == 0:
        abort('TPCONF_router must define router')

    if len(config.TPCONF_hosts) == 0:
        abort('TPCONF_hosts must define at least one host')

    all_hosts = config.TPCONF_router + config.TPCONF_hosts

    # check that we have internal IPs defined
    for h in all_hosts:
        if h not in config.TPCONF_host_internal_ip:
            abort('Internal IP not defined for host %s' % h)
        internal_ips = config.TPCONF_host_internal_ip[h]
        if not isinstance(internal_ips, list):
            abort('Internal IP(s) are not a list for host %s' % h)
        if len(internal_ips) < 1:
            abort('Must specify at least one internal IP for host %s' % h)
        if h in config.TPCONF_router and len(internal_ips) < 2:
            abort('Must specify two internal IPs for router %s' % h)

    # if host operating system spec exist check that we have one entry
    # for each host and that OS names are correct
    host_os = getattr(config, 'TPCONF_host_os', None)
    if host_os is not None:
        for h in all_hosts:
            if h not in host_os:
                abort('OS not defined for host %s' % h)
            if host_os[h] not in ('FreeBSD', 'Linux', 'CYGWIN', 'Darwin'):
                abort('Unknown OS for host %s, OS name must be FreeBSD, Linux, '
                      'CYGWIN or Darwin' % h)
            if host_os[h] == 'Linux':
                if h in config.TPCONF_router and \
                        not hasattr(config, 'TPCONF_linux_kern_router'):
                    abort('If router OS is set to Linux, you must specify '
                          'TPCONF_linux_kern_router')
                if h in config.TPCONF_hosts and \
                        not hasattr(config, 'TPCONF_linux_kern_hosts'):
                    abort('If host OS is set to Linux, you must specify '
                          'TPCONF_linux_kern_hosts')

    # TypeError covers values such as None that int() cannot convert
    try:
        int(config.TPCONF_duration)
    except (TypeError, ValueError):
        abort('TPCONF_duration is not an integer')

    # TPCONF_ECN is optional
    for v in getattr(config, 'TPCONF_ECN', []):
        if v not in ('0', '1'):
            abort('TPCONF_ECN entries must be either \'0\' or \'1\'')

    # ---- router queues ----
    # Maps each referenced V_ name to the config entry that referenced it.
    # Do not rename: code generated for eval() below uses this name.
    vnames_referenced = {}

    router_queues = config.TPCONF_router_queues
    if isinstance(router_queues, list):
        check_router_queues(router_queues, vnames_referenced)
    elif isinstance(router_queues, dict):
        for router in router_queues:
            if router not in config.TPCONF_router:
                abort('Router %s specified in TPCONF_router_queues, but '
                      'not listed in TPCONF_router' % router)
            check_router_queues(router_queues[router], vnames_referenced)

    # ---- traffic generators ----
    if not config.TPCONF_traffic_gens:
        abort('TPCONF_traffic_gens must define at least one traffic generator')

    ids = set()
    for entry, (t, c, v) in enumerate(config.TPCONF_traffic_gens, 1):
        # insert all variable names used in vnames referenced
        v = re.sub(
            "(V_[a-zA-Z0-9_-]*)",
            "_reg_vname('\\1', vnames_referenced, 'TPCONF_traffic_gens entry %s')" %
            entry,
            v)

        # NOTE(review): eval() executes text taken from the config file;
        # only run this against config files you trust.
        eval('_args(%s)' % v)

        try:
            float(t)
        except (TypeError, ValueError):
            abort(
                'TPCONF_traffic_gens entry %s: time is not a float' %
                entry)

        try:
            c = int(c)
        except (TypeError, ValueError):
            abort(
                'TPCONF_traffic_gens entry %s: id is not an integer' %
                entry)

        if c in ids:
            abort(
                'TPCONF_traffic_gens entry %s: reused id value %i' %
                (entry, c))
        ids.add(c)

    # ---- parameter definitions ----
    for k in config.TPCONF_vary_parameters:
        if k not in config.TPCONF_parameter_list:
            abort(
                'Parameter \'%s\' used in TPCONF_vary_parameters not defined in '
                'TPCONF_parameter_list' %
                k)

    vnames_defined = set()
    for k in config.TPCONF_parameter_list:
        # the fourth element is not checked, but unpacking still enforces
        # that every definition has exactly four elements
        names, short_names, val_list, extra = config.TPCONF_parameter_list[k]

        if len(names) < 1:
            abort(
                'Empty variable name list for parameter \'%s\' in TPCONF_parameter_list' %
                k)
        if len(short_names) < 1:
            abort(
                'Empty short name list for parameter \'%s\' in TPCONF_parameter_list' %
                k)
        if len(val_list) < 1:
            abort(
                'No parameter values for parameter \'%s\' in TPCONF_parameter_list' %
                k)
        if len(names) != len(short_names):
            abort(
                'Number of variable names and short names is not equal for parameter '
                '\'%s\' in TPCONF_parameter_list' %
                k)

        for name in names:
            if name[0:2] != 'V_':
                # the placeholder was '%k', an invalid conversion that
                # raised ValueError instead of reporting the problem
                abort(
                    'Variable name does not start with V_ for parameter \'%s\' in '
                    'TPCONF_parameter_list' %
                    k)
            if name in vnames_defined:
                abort(
                    'Variable name \'%s\' defined twice in TPCONF_parameter_list' %
                    name)
            vnames_defined.add(name)

        # values must be non-empty and unique per parameter
        val_set = set()
        for val in val_list:
            if len(names) == 1:
                # single values
                if str(val) == '':
                    abort('Empty value for parameter \'%s\'' % k)
            else:
                # tuples
                for item in val:
                    if str(item) == '':
                        abort('Empty value in tuple for parameter \'%s\'' % k)

            # lookup single values or tuples converted to strings
            val_str = str(val)
            if val_str in val_set:
                abort('Duplicate value \'%s\' for parameter \'%s\'' % (val_str, k))
            val_set.add(val_str)

    # variables that only have a default value also count as defined
    vnames_defined.update(config.TPCONF_variable_defaults)

    for k in vnames_referenced:
        if k not in vnames_defined:
            abort(
                'Variable name %s referenced in %s but not defined in '
                'TPCONF_parameter_list or TPCONF_variable_defaults' %
                (k, vnames_referenced[k]))

    # make sure the debug level is always defined
    if not hasattr(config, 'TPCONF_debug_level'):
        config.TPCONF_debug_level = 0

    puts('Config file looks OK')
## Check hosts for necessary tools (TASK)
@task
@parallel
def check_host():
    """Check that needed tools are installed on hosts

    Runs 'which <tool>' on the current host for the tools expected for its
    role (router or host) and OS type, then uploads the helper shell
    scripts to /usr/bin and makes them executable.
    """

    this_host = env.host_string

    # get type of current host
    os_type = get_type_cached(this_host)

    # run checks
    if this_host in config.TPCONF_router:
        if os_type == 'FreeBSD':
            run('which ipfw')
        if os_type == "Linux":
            for probe in ('which tc', 'which iptables'):
                run(probe)
        # XXX check that kernel tick rate is high (>= 1000)
    else:
        if os_type == 'FreeBSD':
            for probe in ('which md5', 'which tcpdump'):
                run(probe)
        elif os_type == 'Darwin':
            for probe in ('which md5',
                          'which tcpdump',
                          'which dsiftr-osx-teacup.d'):
                run(probe)
        elif os_type == 'Linux':
            # web10g-listconns and web10g-readvars are deliberately not
            # checked (they were commented out)
            for probe in ('which ethtool',
                          'which md5sum',
                          'which tcpdump',
                          'which web10g-logger'):
                run(probe)
        elif os_type == 'CYGWIN':
            for probe in ('which WinDump', 'which win-estats-logger'):
                run(probe, pty=False)

            # if we don't have proper ntp installed then
            # start time service if not started and force resync
            with settings(warn_only=True):
                ntpq_check = run(
                    'ls "/cygdrive/c/Program Files (x86)/NTP/bin/ntpq"')
                if ntpq_check.return_code != 0:
                    run('net start w32time', pty=False)
                    run('w32tm /resync', pty=False)

            # try to enable any test network interfaces that are
            # (accidentally) disabled after reboot
            with settings(warn_only=True):
                for interface in get_netint_cached(this_host, int_no=-1):
                    run('netsh int set int "Local Area Connection %s" enabled' %
                        interface, pty=False)

        # NOTE(review): the source indentation was lost; the generic tool
        # checks below are assumed to belong to the non-router branch -
        # confirm against the upstream file.
        for probe in ('which killall',
                      'which pkill',
                      'which ps',
                      'which gzip',
                      'which dd'):
            run(probe, pty=False)

        # check for traffic sender/receiver tools
        for probe in ('which iperf',
                      'which ping',
                      'which httperf',
                      'which lighttpd',
                      'which nttcp'):
            run(probe, pty=False)

    # NOTE(review): the helper script upload is assumed to run for routers
    # as well as hosts (function level) - confirm against the upstream file.
    for script in ('runbg_wrapper.sh', 'kill_iperf.sh', 'pktgen.sh'):
        put(config.TPCONF_script_path + '/' + script, '/usr/bin')
        run('chmod a+x /usr/bin/' + script, pty=False)
        run('which ' + script, pty=False)
## Check connectivity (and also prime switch's CAM table) (TASK)
@task
@parallel
def check_connectivity():
    """Check connectivity between each pair of hosts with ping

    From the current host, sends two pings to every internal IP of every
    router and host listed in the config.
    """

    # the option for the number of pings depends on the type of the
    # current host: -n on CYGWIN, -c everywhere else
    if get_type_cached(env.host_string) == "CYGWIN":
        ping_cmd = 'ping -n 2 %s'
    else:
        ping_cmd = 'ping -c 2 %s'

    for target in config.TPCONF_router + config.TPCONF_hosts:
        for internal_ip in config.TPCONF_host_internal_ip[target]:
            run(ping_cmd % internal_ip, pty=False)
## Check time synchronisation with control machine (should not run in parallel)
## This is only a simple check to detect if clocks are completely out of sync
## Assumes: the control machine is synchronised (i.e. uses NTP)
@task
def check_time_sync():
    """Check time synchronisation between control host and testbed host clocks

    Reads the unix time on the remote host and then on the control host,
    measures how long both reads took, and aborts if the clock difference
    (less that delay) exceeds config.TPCONF_max_time_diff seconds
    (1 second if the setting is not defined).
    """

    allowed_time_diff = getattr(config, 'TPCONF_max_time_diff', 1)

    # get type of current host
    htype = get_type_cached(env.host_string)

    # Fail with a clear message for a host type we have no command for;
    # previously the remote timestamp was left unassigned in that case
    # and the task died with a NameError.
    if htype not in ('FreeBSD', 'Linux', 'Darwin', 'CYGWIN'):
        abort(
            'Host %s: cannot check time synchronisation for host type %s' %
            (env.host_string, htype))

    # get timestamps in unix time to avoid having to do time format conversions
    # XXX should get timestamps in milliseconds, cause now we have huge quantisation
    # error, but how to do this in a portable way?

    t1 = datetime.datetime.now()
    if htype == 'CYGWIN':
        rdate = run('date +\'%s\'', pty=False)
    else:
        rdate = run('date +\'%s\'')
    ldate = local('date +\'%s\'', capture=True)
    t2 = datetime.datetime.now()

    # time spent fetching both timestamps, in (fractional) seconds
    dt_diff = t2 - t1
    sec_diff = (dt_diff.days * 24 * 3600 + dt_diff.seconds) + \
        (dt_diff.microseconds / 1000000.0)

    puts(
        'Local time: %s, remote time: %s, proc delay: %s' %
        (ldate, rdate, str(sec_diff)))

    # the remote time is read before the local time, so the measured
    # delay is subtracted from the raw difference
    diff = abs(int(ldate) - int(rdate) - sec_diff)
    if diff > allowed_time_diff:
        abort(
            'Host %s time synchronisation error (difference > %s seconds)' %
            (env.host_string, str(allowed_time_diff)))
## Kill any old processes (TASK)
@task
@parallel
def kill_old_processes():
    """Kill old logging or traffic generation processes still running

    Runs killall for the loggers matching the current host's type and for
    the traffic generators, then removes old lighttpd pid files and old
    log files in /tmp.
    """

    # get type of current host
    os_type = get_type_cached(env.host_string)

    # failing commands only produce warnings inside this block
    with settings(warn_only=True):
        # loggers: tcpdump on every type except CYGWIN, plus the type's own
        if os_type in ('FreeBSD', 'Linux', 'Darwin'):
            run('killall tcpdump', pty=False)

        if os_type == 'Linux':
            # web10g_logger.sh is deliberately not killed (was commented out)
            run('killall web10g-logger')
        elif os_type == 'Darwin':
            run('killall dsiftr-osx-teacup.d', pty=False)
        elif os_type == 'CYGWIN':
            run('killall WinDump', pty=False)
            run('killall win-estats-logger', pty=False)

        # on new cygwin iperf does not stop anymore on sigterm
        if os_type == 'CYGWIN':
            iperf_kill = 'killall -9 iperf'
        else:
            iperf_kill = 'killall iperf'
        run(iperf_kill, pty=False)

        # the rm deletes old lighttpd pid files (XXX would be better to
        # delete after experiment)
        for command in ('killall ping',
                        'killall httperf',
                        'killall lighttpd',
                        'rm -f /var/run/*lighttpd.pid',
                        'killall runbg_wrapper.sh'):
            run(command, pty=False)

        run('killall nttcp')
        run('killall pktgen.sh ; killall python')

    # remove old log stuff in /tmp
    # NOTE(review): the source indentation was lost; this command is
    # assumed to sit outside the warn_only block - confirm.
    run('rm -f /tmp/*.log', pty=False)
## Collect host info, prefill caches (must not be run in parallel!!!)
## any parallel task cannot fill the caches cause the parallel execution
## is done with fork()
#  @param htype '0' don't get host OS, '1' get host OS
#  @param netint '0' don't get network interface names,
#                '1' get network interface names
#  @param netmac '0' don't get MAC addresses, '1' get MAC addresses
@serial
def get_host_info(htype='1', netint='1', netmac='1'):
    """Populate the host info caches

    Calls the cached lookup functions for the current host so that later
    calls can use the cached results. Each argument is the string '1' to
    perform that lookup; any other value skips it.
    """

    target = env.host_string

    if htype == '1':
        get_type_cached(target)

    if netint == '1':
        # first with the default internal_int setting, then with
        # internal_int='0'
        for extra in ({}, {'internal_int': '0'}):
            get_netint_cached(target, int_no=-1, **extra)
            get_netint_windump_cached(target, int_no=-1, **extra)

    if netmac == '1':
        get_netmac_cached(target)
## Run all sanity checks
@task
def sanity_checks():
    """Perform all sanity checks, e.g. check for needed tools and connectivity

    Executes, in this order and on the router(s) plus all hosts:
    check_host, check_connectivity, kill_old_processes, check_time_sync.
    """

    ordered_checks = (
        check_host,
        check_connectivity,
        kill_old_processes,
        check_time_sync,
    )
    for check in ordered_checks:
        # build the host list per call, exactly as each original call did
        execute(check, hosts=config.TPCONF_router + config.TPCONF_hosts)