From 8ab08708fcd2100ed5b9e344060156cc2a90f146 Mon Sep 17 00:00:00 2001 From: Daniel Mitterdorfer Date: Tue, 17 Sep 2019 16:08:08 +0200 Subject: [PATCH 1/4] Add heapdump telemetry device With this commit we add a new telemetry device `heapdump` that allows users to capture a heapdump right before a node is shutdown. This can be useful together with `--on-error=abort` where we then want to know the state of the heap at this point. --- docs/telemetry.rst | 6 ++++++ esrally/mechanic/telemetry.py | 24 ++++++++++++++++++++++-- tests/mechanic/telemetry_test.py | 12 ++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/docs/telemetry.rst b/docs/telemetry.rst index 05d3c9db1..b1f011b48 100644 --- a/docs/telemetry.rst +++ b/docs/telemetry.rst @@ -21,6 +21,7 @@ You probably want to gain additional insights from a race. Therefore, we have ad jit JIT Compiler Profiler Enables JIT compiler logs. gc GC log Enables GC logs. jfr Flight Recorder Enables Java Flight Recorder (requires an Oracle JDK or OpenJDK 11+) + heapdump Heap Dump Captures a heap dump. node-stats Node Stats Regularly samples node stats recovery-stats Recovery Stats Regularly samples shard recovery stats ccr-stats CCR Stats Regularly samples Cross Cluster Replication (CCR) related stats @@ -66,6 +67,11 @@ gc The ``gc`` telemetry device enables GC logs for the benchmark candidate. You can use tools like `GCViewer `_ to analyze the GC logs. +heapdump +-------- + +The ``heapdump`` telemetry device will capture a heap dump after a benchmark has finished and right before the node is shutdown. + node-stats ---------- diff --git a/esrally/mechanic/telemetry.py b/esrally/mechanic/telemetry.py index c2f34a8ad..e112f946d 100644 --- a/esrally/mechanic/telemetry.py +++ b/esrally/mechanic/telemetry.py @@ -25,13 +25,14 @@ from esrally import metrics, time, exceptions from esrally.metrics import MetaInfoScope -from esrally.utils import io, sysstats, console, opts +from esrally.utils import io, sysstats, console, opts, process def list_telemetry(): console.println("Available telemetry devices:\n") devices = [[device.command, device.human_name, device.help] for device in [JitCompiler, Gc, FlightRecorder, - NodeStats, RecoveryStats, CcrStats]] + Heapdump, NodeStats, RecoveryStats, + CcrStats]] console.println(tabulate.tabulate(devices, ["Command", "Name", "Description"])) console.println("\nKeep in mind that each telemetry device may incur a runtime overhead which can skew results.") @@ -259,6 +260,25 @@ def java_opts(self, log_file): return ["-Xlog:gc*=info,safepoint=info,age*=trace:file={}:utctime,uptimemillis,level,tags:filecount=0".format(log_file)] +class Heapdump(TelemetryDevice): + internal = False + command = "heapdump" + human_name = "Heap Dump" + help = "Captures a heap dump." + + def __init__(self, log_root): + super().__init__() + self.log_root = log_root + + def detach_from_node(self, node, running): + if running: + heap_dump_file = os.path.join(self.log_root, "heap_at_exit_{}.hprof".format(node.pid)) + self.logger.info("Writing heap dump to [%s]", heap_dump_file) + cmd = "jmap -dump:live,format=b,file={} {}".format(heap_dump_file, node.pid) + if process.run_subprocess_with_logging(cmd): + self.logger.warning("Could not write heap dump to [%s]", heap_dump_file) + + class CcrStats(TelemetryDevice): internal = False command = "ccr-stats" diff --git a/tests/mechanic/telemetry_test.py b/tests/mechanic/telemetry_test.py index fd1716c42..b110b464f 100644 --- a/tests/mechanic/telemetry_test.py +++ b/tests/mechanic/telemetry_test.py @@ -227,6 +227,18 @@ def test_sets_options_for_java_9_or_above(self): gc_java_opts) +class HeapdumpTests(TestCase): + @mock.patch("esrally.utils.process.run_subprocess_with_logging") + def test_generates_heap_dump(self, run_subprocess_with_logging): + run_subprocess_with_logging.return_value = 0 + heapdump = telemetry.Heapdump("/var/log") + t = telemetry.Telemetry(enabled_devices=[heapdump.command], devices=[heapdump]) + node = cluster.Node(pid="1234", host_name="localhost", node_name="rally0", telemetry=t) + t.attach_to_node(node) + t.detach_from_node(node, running=True) + run_subprocess_with_logging.assert_called_with("jmap -dump:live,format=b,file=/var/log/heap_at_exit_1234.hprof 1234") + + class CcrStatsTests(TestCase): def test_negative_sample_interval_forbidden(self): clients = {"default": Client(), "cluster_b": Client()} From 01d78805c2a9f7fc833dd84200e4a26d938ecf3b Mon Sep 17 00:00:00 2001 From: Daniel Mitterdorfer Date: Tue, 17 Sep 2019 17:06:13 +0200 Subject: [PATCH 2/4] Add telemetry device to launcher --- esrally/mechanic/launcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/esrally/mechanic/launcher.py b/esrally/mechanic/launcher.py index 095c70939..f8acd1367 100644 --- a/esrally/mechanic/launcher.py +++ b/esrally/mechanic/launcher.py @@ -316,6 +316,7 @@ def _start_node(self, node_configuration, node_count_on_host): telemetry.FlightRecorder(telemetry_params, node_telemetry_dir, java_major_version), telemetry.JitCompiler(node_telemetry_dir), telemetry.Gc(node_telemetry_dir, java_major_version), + telemetry.Heapdump(node_telemetry_dir), telemetry.DiskIo(self.metrics_store, node_count_on_host, node_telemetry_dir, node_name), telemetry.NodeEnvironmentInfo(self.metrics_store), telemetry.IndexSize(data_paths, self.metrics_store), From 37af12cb8ce9be4728afb5414deff6be4bd9bf31 Mon Sep 17 00:00:00 2001 From: Daniel Mitterdorfer Date: Wed, 18 Sep 2019 15:02:50 +0200 Subject: [PATCH 3/4] Fine-tuning --- esrally/mechanic/telemetry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/esrally/mechanic/telemetry.py b/esrally/mechanic/telemetry.py index e112f946d..680e703e9 100644 --- a/esrally/mechanic/telemetry.py +++ b/esrally/mechanic/telemetry.py @@ -273,8 +273,8 @@ def __init__(self, log_root): def detach_from_node(self, node, running): if running: heap_dump_file = os.path.join(self.log_root, "heap_at_exit_{}.hprof".format(node.pid)) - self.logger.info("Writing heap dump to [%s]", heap_dump_file) - cmd = "jmap -dump:live,format=b,file={} {}".format(heap_dump_file, node.pid) + console.info("{}: Writing heap dump to [{}}]".format(self.human_name, heap_dump_file), logger=self.logger) + cmd = "jmap -dump:format=b,file={} {}".format(heap_dump_file, node.pid) if process.run_subprocess_with_logging(cmd): self.logger.warning("Could not write heap dump to [%s]", heap_dump_file) From e44cbfeddab4d4a8ad41e074c834851cbb65bac4 Mon Sep 17 00:00:00 2001 From: Daniel Mitterdorfer Date: Tue, 22 Oct 2019 13:26:53 +0200 Subject: [PATCH 4/4] Address review comments --- esrally/telemetry.py | 2 +- tests/telemetry_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/esrally/telemetry.py b/esrally/telemetry.py index 5e2fbb611..72cdb0a27 100644 --- a/esrally/telemetry.py +++ b/esrally/telemetry.py @@ -257,7 +257,7 @@ def __init__(self, log_root): def detach_from_node(self, node, running): if running: heap_dump_file = os.path.join(self.log_root, "heap_at_exit_{}.hprof".format(node.pid)) - console.info("{}: Writing heap dump to [{}}]".format(self.human_name, heap_dump_file), logger=self.logger) + console.info("{}: Writing heap dump to [{}]".format(self.human_name, heap_dump_file), logger=self.logger) cmd = "jmap -dump:format=b,file={} {}".format(heap_dump_file, node.pid) if process.run_subprocess_with_logging(cmd): self.logger.warning("Could not write heap dump to [%s]", heap_dump_file) diff --git a/tests/telemetry_test.py b/tests/telemetry_test.py index 268899f2b..225806c18 100644 --- a/tests/telemetry_test.py +++ b/tests/telemetry_test.py @@ -236,7 +236,7 @@ def test_generates_heap_dump(self, run_subprocess_with_logging): node = cluster.Node(pid="1234", host_name="localhost", node_name="rally0", telemetry=t) t.attach_to_node(node) t.detach_from_node(node, running=True) - run_subprocess_with_logging.assert_called_with("jmap -dump:live,format=b,file=/var/log/heap_at_exit_1234.hprof 1234") + run_subprocess_with_logging.assert_called_with("jmap -dump:format=b,file=/var/log/heap_at_exit_1234.hprof 1234") class CcrStatsTests(TestCase):