diff --git a/docs/telemetry.rst b/docs/telemetry.rst index 05d3c9db1..b1f011b48 100644 --- a/docs/telemetry.rst +++ b/docs/telemetry.rst @@ -21,6 +21,7 @@ You probably want to gain additional insights from a race. Therefore, we have ad jit JIT Compiler Profiler Enables JIT compiler logs. gc GC log Enables GC logs. jfr Flight Recorder Enables Java Flight Recorder (requires an Oracle JDK or OpenJDK 11+) + heapdump Heap Dump Captures a heap dump. node-stats Node Stats Regularly samples node stats recovery-stats Recovery Stats Regularly samples shard recovery stats ccr-stats CCR Stats Regularly samples Cross Cluster Replication (CCR) related stats @@ -66,6 +67,11 @@ gc The ``gc`` telemetry device enables GC logs for the benchmark candidate. You can use tools like `GCViewer <https://github.com/chewiebug/GCViewer>`_ to analyze the GC logs. +heapdump +-------- + +The ``heapdump`` telemetry device will capture a heap dump after a benchmark has finished and right before the node is shut down. + node-stats ---------- diff --git a/esrally/mechanic/launcher.py b/esrally/mechanic/launcher.py index 2e28ac277..e2767b1ea 100644 --- a/esrally/mechanic/launcher.py +++ b/esrally/mechanic/launcher.py @@ -163,6 +163,7 @@ def _start_node(self, node_configuration, node_count_on_host): telemetry.FlightRecorder(telemetry_params, node_telemetry_dir, java_major_version), telemetry.JitCompiler(node_telemetry_dir), telemetry.Gc(node_telemetry_dir, java_major_version), + telemetry.Heapdump(node_telemetry_dir), telemetry.DiskIo(self.metrics_store, node_count_on_host, node_telemetry_dir, node_name), telemetry.IndexSize(data_paths, self.metrics_store), telemetry.StartupTime(self.metrics_store), diff --git a/esrally/telemetry.py b/esrally/telemetry.py index f56cef412..72cdb0a27 100644 --- a/esrally/telemetry.py +++ b/esrally/telemetry.py @@ -25,13 +25,14 @@ from esrally import metrics, time, exceptions from esrally.metrics import MetaInfoScope -from esrally.utils import io, sysstats, console, opts +from esrally.utils import io, sysstats, 
console, opts, process def list_telemetry(): console.println("Available telemetry devices:\n") devices = [[device.command, device.human_name, device.help] for device in [JitCompiler, Gc, FlightRecorder, - NodeStats, RecoveryStats, CcrStats]] + Heapdump, NodeStats, RecoveryStats, + CcrStats]] console.println(tabulate.tabulate(devices, ["Command", "Name", "Description"])) console.println("\nKeep in mind that each telemetry device may incur a runtime overhead which can skew results.") @@ -243,6 +244,25 @@ def java_opts(self, log_file): return ["-Xlog:gc*=info,safepoint=info,age*=trace:file={}:utctime,uptimemillis,level,tags:filecount=0".format(log_file)] +class Heapdump(TelemetryDevice): + internal = False + command = "heapdump" + human_name = "Heap Dump" + help = "Captures a heap dump." + + def __init__(self, log_root): + super().__init__() + self.log_root = log_root + + def detach_from_node(self, node, running): + if running: + heap_dump_file = os.path.join(self.log_root, "heap_at_exit_{}.hprof".format(node.pid)) + console.info("{}: Writing heap dump to [{}]".format(self.human_name, heap_dump_file), logger=self.logger) + cmd = "jmap -dump:format=b,file={} {}".format(heap_dump_file, node.pid) + if process.run_subprocess_with_logging(cmd): + self.logger.warning("Could not write heap dump to [%s]", heap_dump_file) + + class CcrStats(TelemetryDevice): internal = False command = "ccr-stats" diff --git a/tests/telemetry_test.py b/tests/telemetry_test.py index 90a62c3cb..225806c18 100644 --- a/tests/telemetry_test.py +++ b/tests/telemetry_test.py @@ -227,6 +227,18 @@ def test_sets_options_for_java_9_or_above(self): gc_java_opts) +class HeapdumpTests(TestCase): + @mock.patch("esrally.utils.process.run_subprocess_with_logging") + def test_generates_heap_dump(self, run_subprocess_with_logging): + run_subprocess_with_logging.return_value = 0 + heapdump = telemetry.Heapdump("/var/log") + t = telemetry.Telemetry(enabled_devices=[heapdump.command], devices=[heapdump]) + node = 
cluster.Node(pid="1234", host_name="localhost", node_name="rally0", telemetry=t) + t.attach_to_node(node) + t.detach_from_node(node, running=True) + run_subprocess_with_logging.assert_called_with("jmap -dump:format=b,file=/var/log/heap_at_exit_1234.hprof 1234") + + class CcrStatsTests(TestCase): def test_negative_sample_interval_forbidden(self): clients = {"default": Client(), "cluster_b": Client()}