From b10f7a59ee1cea479ddfae60c3debae0291e9aee Mon Sep 17 00:00:00 2001 From: Siim Kallas Date: Tue, 21 Mar 2023 17:33:44 +0200 Subject: [PATCH] feat: debug metrics (#700) * feat: produce debug metrics for cpu profiling, get rid of text based cpu profiles * feat: emit debug metrics * fix: don't clean coverage * doc: mention debug metrics --- docs/advanced-config.md | 1 + package.json | 4 +- src/metrics/debug_metrics.ts | 119 +++++++ src/metrics/index.ts | 16 +- src/native_ext/memory_profiling.cpp | 10 + src/native_ext/profiling.cpp | 330 ++------------------ src/profiling/OTLPProfilingExporter.ts | 4 +- src/profiling/index.ts | 20 +- src/profiling/types.ts | 26 +- src/profiling/utils.ts | 9 +- src/types.ts | 1 + test/metrics.test.ts | 26 +- test/profiling/extension.test.ts | 69 ++-- test/profiling/profiling.test.ts | 8 +- test/separate_process/debug_metrics.test.ts | 94 ++++++ test/utils.ts | 19 ++ 16 files changed, 355 insertions(+), 401 deletions(-) create mode 100644 src/metrics/debug_metrics.ts create mode 100644 test/separate_process/debug_metrics.test.ts diff --git a/docs/advanced-config.md b/docs/advanced-config.md index 1efa4fb4..5e186c67 100644 --- a/docs/advanced-config.md +++ b/docs/advanced-config.md @@ -104,6 +104,7 @@ The following config options can be set by passing them as tracing arguments to | n/a
`metrics.resourceFactory` | | Experimental | Callback which allows to filter the default resource or provide a custom one. The function takes one argument of type `Resource` which is the resource pre-filled by the SDK containing the `service.name`, environment, host and process attributes. | | `SPLUNK_RUNTIME_METRICS_ENABLED`
`metrics.runtimeMetricsEnabled` | `true` | Experimental | Enable collecting and exporting of runtime metrics. | `SPLUNK_RUNTIME_METRICS_COLLECTION_INTERVAL`
`metrics.runtimeMetricsCollectionIntervalMillis` | `5000` | Experimental | The interval, in milliseconds, during which GC and event loop statistics are collected. After the collection is done, the values become available to the metric exporter. +| `SPLUNK_DEBUG_METRICS_ENABLED`
`metrics.debugMetricsEnabled` | `false` | Experimental | Enable collection of various internal metrics (e.g. the profiler's internal performance). Only useful when troubleshooting issues and should not be switched on otherwise. ### Profiling diff --git a/package.json b/package.json index a31d2b5a..a6638598 100644 --- a/package.json +++ b/package.json @@ -15,7 +15,9 @@ "lint:fix": "eslint . --ext .ts --fix", "lint": "eslint . --ext .ts", "lint:commits": "commitlint", - "test": "cross-env TEST_ALLOW_DOUBLE_START=y nyc ts-mocha --timeout 60s --parallel --jobs 2 -p tsconfig.json 'test/**/*.test.ts'", + "test": "npm run test:unit && npm run test:debug-metrics", + "test:unit": "cross-env TEST_ALLOW_DOUBLE_START=y nyc ts-mocha --exclude 'test/separate_process/*' --timeout 60s --parallel --jobs 2 -p tsconfig.json 'test/**/*.test.ts'", + "test:debug-metrics": "nyc --no-clean ts-mocha -p tsconfig.json 'test/separate_process/debug_metrics.test.ts'", "prebuild:current": "node scripts/prebuild-current.js", "prebuild:os": "node scripts/prebuild-os.js", "profile:proto": "pbjs -t static-module -w commonjs -o src/profiling/proto/profile.js protos/pprof/profile.proto", diff --git a/src/metrics/debug_metrics.ts b/src/metrics/debug_metrics.ts new file mode 100644 index 00000000..4916b076 --- /dev/null +++ b/src/metrics/debug_metrics.ts @@ -0,0 +1,119 @@ +/* + * Copyright Splunk Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { Histogram, metrics, Meter } from '@opentelemetry/api'; +import { + ExplicitBucketHistogramAggregation, + View, +} from '@opentelemetry/sdk-metrics'; + +interface Meters { + meter: Meter; + cpuProfilerStartDuration: Histogram; + cpuProfilerStopDuration: Histogram; + cpuProfilerProcessingStepDuration: Histogram; + heapProfilerCollectDuration: Histogram; + heapProfilerProcessingStepDuration: Histogram; +} + +let meters: Meters | undefined; + +const instrumentCpuProfilerStart = 'splunk.profiler.cpu.start.duration'; +const instrumentCpuProfilerStop = 'splunk.profiler.cpu.stop.duration'; +const instrumentCpuProfilerProcess = 'splunk.profiler.cpu.process.duration'; +const instrumentHeapProfilerCollect = 'splunk.profiler.heap.collect.duration'; +const instrumentHeapProfilerProcess = 'splunk.profiler.heap.process.duration'; + +export function enableDebugMetrics() { + const meter = metrics.getMeter('splunk-otel-js-debug-metrics'); + const opts = { unit: 'ns' }; + const cpuProfilerStartDuration = meter.createHistogram( + instrumentCpuProfilerStart, + opts + ); + const cpuProfilerStopDuration = meter.createHistogram( + instrumentCpuProfilerStop, + opts + ); + const cpuProfilerProcessingStepDuration = meter.createHistogram( + instrumentCpuProfilerProcess, + opts + ); + const heapProfilerCollectDuration = meter.createHistogram( + instrumentHeapProfilerCollect, + opts + ); + const heapProfilerProcessingStepDuration = meter.createHistogram( + instrumentHeapProfilerProcess, + opts + ); + + meters = { + meter, + cpuProfilerStartDuration, + cpuProfilerStopDuration, + cpuProfilerProcessingStepDuration, + heapProfilerCollectDuration, + heapProfilerProcessingStepDuration, + }; +} + +export function recordCpuProfilerMetrics(metrics: { + profilerStartDuration: number; + profilerStopDuration: number; + profilerProcessingStepDuration: number; +}) { + if (meters === undefined) { + return; + } + + meters.cpuProfilerStartDuration.record(metrics.profilerStartDuration); + meters.cpuProfilerStopDuration.record(metrics.profilerStopDuration); + meters.cpuProfilerProcessingStepDuration.record( + metrics.profilerProcessingStepDuration + ); +} + +export function recordHeapProfilerMetrics(metrics: { + profilerCollectDuration: number; + profilerProcessingStepDuration: number; +}) { + if (meters === undefined) { + return; + } + + meters.heapProfilerCollectDuration.record(metrics.profilerCollectDuration); + meters.heapProfilerProcessingStepDuration.record( + metrics.profilerProcessingStepDuration + ); +} + +export function getDebugMetricsViews(): View[] { + return [ + instrumentCpuProfilerStart, + instrumentCpuProfilerStop, + instrumentCpuProfilerProcess, + ].map( + (instrumentName) => + new View({ + instrumentName, + aggregation: new ExplicitBucketHistogramAggregation( + [1e6, 1e8, 1e9, 1e10], + true + ), + }) + ); +} diff --git a/src/metrics/index.ts b/src/metrics/index.ts index 0db9eca1..4e6cfb9d 100644 --- a/src/metrics/index.ts +++ b/src/metrics/index.ts @@ -34,6 +34,7 @@ import { getEnvValueByPrecedence, getNonEmptyEnvVar, } from '../utils'; +import { enableDebugMetrics, getDebugMetricsViews } from './debug_metrics'; import * as util from 'util'; import { detect as detectResource } from '../resource'; import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions'; @@ -51,6 +52,7 @@ interface MetricsOptions { views?: View[]; exportIntervalMillis: number; metricReaderFactory: MetricReaderFactory; + debugMetricsEnabled: boolean; runtimeMetricsEnabled: boolean; runtimeMetricsCollectionIntervalMillis: number; } @@ -237,6 +239,7 @@ export const allowedMetricsOptions = [ 'runtimeMetricsEnabled', 'runtimeMetricsCollectionIntervalMillis', 'serviceName', + 'debugMetricsEnabled', ]; export function startMetrics(opts: StartMetricsOptions = {}) { @@ -244,9 +247,13 @@ export function startMetrics(opts: StartMetricsOptions = {}) { const options = _setDefaultOptions(opts); + const debugMetricsViews: View[] = options.debugMetricsEnabled + ? getDebugMetricsViews() + : []; + const provider = new MeterProvider({ resource: options.resource, - views: options.views, + views: [...(options.views || []), ...debugMetricsViews], }); const metricReaders = options.metricReaderFactory(options); @@ -263,6 +270,10 @@ export function startMetrics(opts: StartMetricsOptions = {}) { await provider.shutdown(); } + if (options.debugMetricsEnabled) { + enableDebugMetrics(); + } + if (!options.runtimeMetricsEnabled) { return { stop: stopGlobalMetrics, @@ -425,6 +436,9 @@ export function _setDefaultOptions( exportIntervalMillis: options.exportIntervalMillis || getEnvNumber('OTEL_METRIC_EXPORT_INTERVAL', 30_000), + debugMetricsEnabled: + options.debugMetricsEnabled ?? + getEnvBoolean('SPLUNK_DEBUG_METRICS_ENABLED', false), runtimeMetricsEnabled: options.runtimeMetricsEnabled ?? getEnvBoolean('SPLUNK_RUNTIME_METRICS_ENABLED', true), diff --git a/src/native_ext/memory_profiling.cpp b/src/native_ext/memory_profiling.cpp index bef1aefe..6c5313e7 100644 --- a/src/native_ext/memory_profiling.cpp +++ b/src/native_ext/memory_profiling.cpp @@ -108,12 +108,14 @@ NAN_METHOD(CollectHeapProfile) { return; } + int64_t allocationProfileStart = HrTime(); v8::AllocationProfile* profile = profiler->GetAllocationProfile(); if (!profile) { return; } + int64_t sampleProcessingStart = HrTime(); auto jsResult = Nan::New(); auto jsSamples = Nan::New(); auto jsNodeTree = Nan::New(); @@ -185,11 +187,19 @@ NAN_METHOD(CollectHeapProfile) { } } + int64_t sampleProcessingEnd = HrTime(); + Nan::Set(jsResult, Nan::New("treeMap").ToLocalChecked(), jsNodeTree); Nan::Set(jsResult, Nan::New("samples").ToLocalChecked(), jsSamples); Nan::Set( jsResult, Nan::New("timestamp").ToLocalChecked(), Nan::New(MilliSecondsSinceEpoch())); + Nan::Set( + jsResult, Nan::New("profilerCollectDuration").ToLocalChecked(), + Nan::New((double)(sampleProcessingStart - allocationProfileStart))); + Nan::Set( + jsResult, Nan::New("profilerProcessingStepDuration").ToLocalChecked(), + Nan::New((double)(sampleProcessingEnd - sampleProcessingStart))); info.GetReturnValue().Set(jsResult); diff --git a/src/native_ext/profiling.cpp b/src/native_ext/profiling.cpp index 8afba075..edbfcc4a 100644 --- a/src/native_ext/profiling.cpp +++ b/src/native_ext/profiling.cpp @@ -386,166 +386,6 @@ NAN_METHOD(StartProfiling) { profiling->sampleCutoffPoint = HrTime(); } -struct StringBuilder { - StringBuilder(char* buffer, size_t length) : buffer(buffer), capacity(length) {} - - size_t Add(const char* s, size_t length) { - memcpy(buffer + offset, s, length); - offset += length; - return offset; - } - - size_t Add(int32_t value) { - size_t digitSize = modp_uitoa10(value, buffer + offset); - offset += digitSize; - return offset; - } - - size_t Add(char c) { - buffer[offset++] = c; - return offset; - } - - char* buffer; - size_t capacity; - size_t offset = 0; -}; - -size_t RemoveParen(char* content, size_t length) { - size_t size = 0; - for (size_t i = 0; i < length; i++) { - char c = content[i]; - - if (c == '(' || c == ')') { - continue; - } - - content[size++] = c; - } - - return size; -} - -String NewStackLine(PagedArena* arena, const v8::CpuProfileNode* node) { - const char* rawFunction = node->GetFunctionNameStr(); - const char* rawFileName = node->GetScriptResourceNameStr(); - size_t functionLen = strlen(rawFunction); - size_t fileNameLen = strlen(rawFileName); - - if (functionLen == 0) { - rawFunction = "anonymous"; - functionLen = 9; - } - - if (fileNameLen == 0) { - rawFileName = "unknown"; - fileNameLen = 7; - } - - const size_t extraLength = 8; - const size_t lineNoLength = 16; - const size_t bytesNeeded = functionLen + fileNameLen + extraLength + lineNoLength; - - char* content = (char*)PagedArenaAlloc(arena, bytesNeeded); - - if (!content) { - return String(); - } - - StringBuilder builder(content, bytesNeeded); - builder.Add(rawFunction, functionLen); - builder.offset = RemoveParen(content, functionLen); - builder.Add('('); - builder.Add(rawFileName, fileNameLen); - builder.Add(':'); - builder.Add(node->GetLineNumber()); - builder.Add(':'); - builder.Add(node->GetColumnNumber()); - builder.Add(')'); - builder.Add('\n'); - - return String(content, builder.offset); -} - -struct StacktraceBuilder { - struct StackLines { - static const int32_t kMaxLines = 32; - String lines[kMaxLines]; - int32_t count = 0; - StackLines* next = nullptr; - }; - - StacktraceBuilder(PagedArena* arena, StackLineCache* cache) - : arena(arena), cache(cache), lines(&entry) {} - - void Add(const v8::CpuProfileNode* node) { - String line = cache->Get(node->GetNodeId()); - - if (line.IsEmpty()) { - line = NewStackLine(arena, node); - cache->Set(node->GetNodeId(), line); - } - - lines->lines[lines->count++] = line; - - if (lines->count == StackLines::kMaxLines) { - StackLines* newBuffer = (StackLines*)PagedArenaAlloc(arena, sizeof(StackLines)); - - if (!newBuffer) { - return; - } - - lines->next = newBuffer; - lines = newBuffer; - } - } - - String Build() { - static const char prefix[] = "\n\n"; - - size_t bytesNeeded = sizeof(prefix) - 1; - - { - StackLines* l = &entry; - - while (l) { - for (int32_t i = 0; i < l->count; i++) { - String* line = &l->lines[i]; - bytesNeeded += line->length; - } - - l = l->next; - } - } - - char* dest = (char*)PagedArenaAlloc(arena, bytesNeeded); - - if (!dest) { - return String(); - } - - StringBuilder builder(dest, bytesNeeded); - builder.Add(prefix, sizeof(prefix) - 1); - - StackLines* l = &entry; - while (l) { - for (int32_t i = 0; i < l->count; i++) { - String* line = &l->lines[i]; - builder.Add(line->data, line->length); - } - - l = l->next; - } - - return String(dest, builder.offset); - } - - PagedArena* arena; - StackLineCache* cache; - StackLines* lines; - StackLines entry; -}; - size_t TimestampString(int64_t ts, char* out) { return modp_litoa10(ts, out); } bool ShouldIncludeSample(Profiling* profiling, int64_t sampleTimestamp) { @@ -632,112 +472,6 @@ void ProfilingRecordDebugInfo(Profiling* profiling, v8::Local profil #endif } -void ProfilingBuildStacktraces( - Profiling* profiling, v8::CpuProfile* profile, v8::Local profilingData) { - auto jsTraces = Nan::New(); - Nan::Set(profilingData, Nan::New("stacktraces").ToLocalChecked(), jsTraces); - - char startTimeNanos[32]; - size_t startTimeNanosLen = TimestampString(profiling->wallStartTime, startTimeNanos); - - Nan::Set( - profilingData, Nan::New("startTimeNanos").ToLocalChecked(), - Nan::New(startTimeNanos, startTimeNanosLen).ToLocalChecked()); - -#if PROFILER_DEBUG_EXPORT - { - char tpBuf[32]; - size_t tpLen = TimestampString(profiling->startTime, tpBuf); - Nan::Set( - profilingData, Nan::New("startTimepoint").ToLocalChecked(), - Nan::New(tpBuf, tpLen).ToLocalChecked()); - } -#endif - - int32_t traceCount = 0; - int64_t nextSampleTs = profile->GetStartTime() * 1000LL; - for (int i = 0; i < profile->GetSamplesCount(); i++) { - int64_t monotonicTs = profile->GetSampleTimestamp(i) * 1000LL; - - bool isValidSample = ShouldIncludeSample(profiling, monotonicTs) && monotonicTs >= nextSampleTs; - if (!isValidSample) { - continue; - } - - nextSampleTs += profiling->samplingIntervalNanos; - - const v8::CpuProfileNode* sample = profile->GetSample(i); - StacktraceBuilder builder(&profiling->arena, &profiling->stacklineCache); - builder.Add(sample); - - int64_t monotonicDelta = monotonicTs - profiling->startTime; - int64_t sampleTimestamp = profiling->wallStartTime + monotonicDelta; - - // TODO: Node <12.5 does not have GetParent, so we'd need to traverse the tree top down instead. -#if NODE_VERSION_AT_LEAST(12, 5, 0) - const v8::CpuProfileNode* parent = sample->GetParent(); - while (parent) { - const v8::CpuProfileNode* next = parent->GetParent(); - - // Skip the root node as it does not contain useful information. - if (next) { - builder.Add(parent); - } - - parent = next; - } -#endif - - String stacktrace = builder.Build(); - - if (stacktrace.IsEmpty()) { - continue; - } - - char tsBuf[32]; - size_t tsLen = TimestampString(sampleTimestamp, tsBuf); - - auto jsTrace = Nan::New(); - - Nan::Set( - jsTrace, Nan::New("timestamp").ToLocalChecked(), - Nan::New(tsBuf, tsLen).ToLocalChecked()); - Nan::Set( - jsTrace, Nan::New("stacktrace").ToLocalChecked(), - Nan::New(stacktrace.data, stacktrace.length).ToLocalChecked()); - -#if PROFILER_DEBUG_EXPORT - char tpBuf[32]; - size_t tpLen = TimestampString(monotonicTs, tpBuf); - Nan::Set( - jsTrace, Nan::New("timepoint").ToLocalChecked(), - Nan::New(tpBuf, tpLen).ToLocalChecked()); -#endif - - SpanActivation* match = FindClosestActivation(profiling, monotonicTs); - - if (match) { - uint8_t spanId[8]; - uint8_t traceId[16]; - HexToBinary(match->spanId, 16, spanId, sizeof(spanId)); - HexToBinary(match->traceId, 32, traceId, sizeof(traceId)); - - Nan::Set( - jsTrace, Nan::New("spanId").ToLocalChecked(), - Nan::CopyBuffer((const char*)spanId, 8).ToLocalChecked()); - Nan::Set( - jsTrace, Nan::New("traceId").ToLocalChecked(), - Nan::CopyBuffer((const char*)traceId, 16).ToLocalChecked()); - -#if PROFILER_DEBUG_EXPORT - match->is_intersected = true; -#endif - } - - Nan::Set(jsTraces, traceCount++, jsTrace); - } -} - v8::Local makeStackLine(const v8::CpuProfileNode* node) { auto jsResult = Nan::New(); @@ -760,7 +494,7 @@ v8::Local makeStackLine(const v8::CpuProfileNode* node) { return jsResult; } -void ProfilingBuildRawStacktraces( +void ProfilingBuildStacktraces( Profiling* profiling, v8::CpuProfile* profile, v8::Local profilingData) { auto jsTraces = Nan::New(); Nan::Set(profilingData, Nan::New("stacktraces").ToLocalChecked(), jsTraces); @@ -881,12 +615,18 @@ NAN_METHOD(CollectProfilingData) { ProfileTitle(profiling->profilerSeq, nextTitle, sizeof(nextTitle)); profiling->activationDepth = 0; - int64_t newWallStart = MicroSecondsSinceEpoch() * 1000L; int64_t newStartTime = HrTime(); + int64_t newWallStart = MicroSecondsSinceEpoch() * 1000L; V8StartProfiling(profiling->profiler, nextTitle); + int64_t profilerStopBegin = HrTime(); + int64_t profilerStartDuration = profilerStopBegin - newStartTime; + v8::CpuProfile* profile = profiling->profiler->StopProfiling(Nan::New(prevTitle).ToLocalChecked()); + int64_t profilerStopEnd = HrTime(); + int64_t profilerStopDuration = profilerStopEnd - profilerStopBegin; + if (!profile) { // profile with this title might've already be ended using a previous stop call profiling->startTime = newStartTime; @@ -898,46 +638,18 @@ NAN_METHOD(CollectProfilingData) { info.GetReturnValue().Set(jsProfilingData); ProfilingBuildStacktraces(profiling, profile, jsProfilingData); - ProfilingRecordDebugInfo(profiling, jsProfilingData); - ProfilingReset(profiling); - profile->Delete(); - - profiling->startTime = newStartTime; - profiling->wallStartTime = newWallStart; - profiling->sampleCutoffPoint = HrTime(); -} - -NAN_METHOD(CollectProfilingDataRaw) { - info.GetReturnValue().SetNull(); - if (!profiling) { - return; - } - - char prevTitle[64]; - ProfileTitle(profiling->profilerSeq, prevTitle, sizeof(prevTitle)); - profiling->profilerSeq++; - char nextTitle[64]; - ProfileTitle(profiling->profilerSeq, nextTitle, sizeof(nextTitle)); - - profiling->activationDepth = 0; - int64_t newStartTime = HrTime(); - int64_t newWallStart = MicroSecondsSinceEpoch() * 1000L; - - V8StartProfiling(profiling->profiler, nextTitle); - - v8::CpuProfile* profile = - profiling->profiler->StopProfiling(Nan::New(prevTitle).ToLocalChecked()); - if (!profile) { - // profile with this title might've already be ended using a previous stop call - profiling->startTime = newStartTime; - profiling->wallStartTime = newWallStart; - return; - } + int64_t profilerProcessingStepDuration = HrTime() - profilerStopEnd; - auto jsProfilingData = Nan::New(); - info.GetReturnValue().Set(jsProfilingData); + Nan::Set( + jsProfilingData, Nan::New("profilerStartDuration").ToLocalChecked(), + Nan::New((double)profilerStartDuration)); + Nan::Set( + jsProfilingData, Nan::New("profilerStopDuration").ToLocalChecked(), + Nan::New((double)profilerStopDuration)); + Nan::Set( + jsProfilingData, Nan::New("profilerProcessingStepDuration").ToLocalChecked(), + Nan::New((double)profilerProcessingStepDuration)); - ProfilingBuildRawStacktraces(profiling, profile, jsProfilingData); ProfilingRecordDebugInfo(profiling, jsProfilingData); ProfilingReset(profiling); profile->Delete(); @@ -966,7 +678,7 @@ NAN_METHOD(StopProfiling) { auto jsProfilingData = Nan::New(); info.GetReturnValue().Set(jsProfilingData); - ProfilingBuildRawStacktraces(profiling, profile, jsProfilingData); + ProfilingBuildStacktraces(profiling, profile, jsProfilingData); ProfilingRecordDebugInfo(profiling, jsProfilingData); ProfilingReset(profiling); profile->Delete(); @@ -1096,10 +808,6 @@ void Initialize(v8::Local target) { profilingModule, Nan::New("collect").ToLocalChecked(), Nan::GetFunction(Nan::New(CollectProfilingData)).ToLocalChecked()); - Nan::Set( - profilingModule, Nan::New("collectRaw").ToLocalChecked(), - Nan::GetFunction(Nan::New(CollectProfilingDataRaw)).ToLocalChecked()); - Nan::Set( profilingModule, Nan::New("enterContext").ToLocalChecked(), Nan::GetFunction(Nan::New(EnterContext)).ToLocalChecked()); diff --git a/src/profiling/OTLPProfilingExporter.ts b/src/profiling/OTLPProfilingExporter.ts index e814e01a..28334a3e 100644 --- a/src/profiling/OTLPProfilingExporter.ts +++ b/src/profiling/OTLPProfilingExporter.ts @@ -16,7 +16,7 @@ import * as protoLoader from '@grpc/proto-loader'; import * as grpc from '@grpc/grpc-js'; import * as path from 'path'; -import { HeapProfile, RawProfilingData, ProfilingExporter } from './types'; +import { CpuProfile, HeapProfile, ProfilingExporter } from './types'; import { diag } from '@opentelemetry/api'; import { Resource } from '@opentelemetry/resources'; import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions'; @@ -123,7 +123,7 @@ export class OTLPProfilingExporter implements ProfilingExporter { } } - send(profile: RawProfilingData) { + send(profile: CpuProfile) { const { stacktraces } = profile; diag.debug(`profiling: Exporting ${stacktraces?.length} samples`); const { callstackInterval } = this._options; diff --git a/src/profiling/index.ts b/src/profiling/index.ts index c9a3f529..67a7d55c 100644 --- a/src/profiling/index.ts +++ b/src/profiling/index.ts @@ -23,6 +23,10 @@ import { getEnvNumber, getNonEmptyEnvVar, } from '../utils'; +import { + recordCpuProfilerMetrics, + recordHeapProfilerMetrics, +} from '../metrics/debug_metrics'; import { detect as detectResource } from '../resource'; import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions'; import { @@ -72,9 +76,9 @@ function extCollectHeapProfile( return extension.collectHeapProfile(); } -function extCollectSamples(extension: ProfilingExtension) { - diag.debug('profiling: Collecting samples'); - return extension.collectRaw(); +function extCollectCpuProfile(extension: ProfilingExtension) { + diag.debug('profiling: Collecting CPU profile'); + return extension.collect(); } export function defaultExporterFactory( @@ -133,11 +137,13 @@ export function startProfiling(opts: StartProfilingOptions = {}) { extStartProfiling(extension, startOptions); const cpuSamplesCollectInterval = setInterval(() => { - const profilingData = extCollectSamples(extension); + const cpuProfile = extCollectCpuProfile(extension); + + if (cpuProfile) { + recordCpuProfilerMetrics(cpuProfile); - if (profilingData) { for (const exporter of exporters) { - exporter.send(profilingData); + exporter.send(cpuProfile); } } }, options.collectionDuration); @@ -150,6 +156,8 @@ export function startProfiling(opts: StartProfilingOptions = {}) { memSamplesCollectInterval = setInterval(() => { const heapProfile = extCollectHeapProfile(extension); if (heapProfile) { + recordHeapProfilerMetrics(heapProfile); + for (const exporter of exporters) { exporter.sendHeapProfile(heapProfile); } diff --git a/src/profiling/types.ts b/src/profiling/types.ts index ec7095e5..5f9fa099 100644 --- a/src/profiling/types.ts +++ b/src/profiling/types.ts @@ -20,22 +20,26 @@ export interface ProfilingStartOptions { recordDebugInfo: boolean; } -interface GenericProfilingStacktrace { +export interface ProfilingStacktrace { /** Timestamp of the sample (nanoseconds since Unix epoch). */ timestamp: string; /** Formatted stacktrace. */ - stacktrace: StackTraceType; + stacktrace: ProfilingStackFrame[]; traceId: Buffer; spanId: Buffer; } -interface GenericProfilingData { +export interface CpuProfile { /** Timestamp when profiling was started (nanoseconds since Unix epoch). */ startTimeNanos: string; - stacktraces: GenericProfilingStacktrace[]; + stacktraces: ProfilingStacktrace[]; + + profilerStartDuration: number; + profilerStopDuration: number; + profilerProcessingStepDuration: number; } -export interface RawProfilingStackFrame extends Array { +export interface ProfilingStackFrame extends Array { /** filename */ 0: string; /** function name */ @@ -46,9 +50,6 @@ export interface RawProfilingStackFrame extends Array { 3: number; } -export type RawProfilingData = GenericProfilingData; -export type ProfilingData = GenericProfilingData; - export interface HeapProfileNode { name: string; scriptName: string; @@ -65,13 +66,14 @@ export interface HeapProfile { samples: AllocationSample[]; treeMap: { [nodeId: string]: HeapProfileNode }; timestamp: number; + profilerCollectDuration: number; + profilerProcessingStepDuration: number; } export interface ProfilingExtension { start(options?: ProfilingStartOptions): void; - stop(): RawProfilingData; - collect(): ProfilingData; - collectRaw(): RawProfilingData; + stop(): CpuProfile; + collect(): CpuProfile; enterContext(context: unknown, traceId: string, spanId: string): void; exitContext(context: unknown): void; startMemoryProfiling(options?: MemoryProfilingOptions): void; @@ -103,7 +105,7 @@ export interface ProfilingOptions { export type StartProfilingOptions = Partial; export interface ProfilingExporter { - send(profile: RawProfilingData): void; + send(profile: CpuProfile): void; sendHeapProfile(profile: HeapProfile): void; } diff --git a/src/profiling/utils.ts b/src/profiling/utils.ts index 9083c088..19e43440 100644 --- a/src/profiling/utils.ts +++ b/src/profiling/utils.ts @@ -21,7 +21,7 @@ import { diag } from '@opentelemetry/api'; import { getNonEmptyEnvVar } from '../utils'; import { perftools } from './proto/profile'; -import type { HeapProfile, RawProfilingData } from './types'; +import type { CpuProfile, HeapProfile } from './types'; const gzipPromise = promisify(gzip); @@ -143,10 +143,7 @@ class Serializer { }); } - serializeCpuProfile( - profile: RawProfilingData, - options: PProfSerializationOptions - ) { + serializeCpuProfile(profile: CpuProfile, options: PProfSerializationOptions) { const { stacktraces } = profile; const STR = { @@ -207,7 +204,7 @@ class Serializer { } export const serialize = ( - profile: RawProfilingData, + profile: CpuProfile, options: PProfSerializationOptions ) => { return new Serializer().serializeCpuProfile(profile, options); diff --git a/src/types.ts b/src/types.ts index 5f5e544e..62436e33 100644 --- a/src/types.ts +++ b/src/types.ts @@ -36,6 +36,7 @@ export type EnvVarKey = | 'OTEL_TRACES_EXPORTER' | 'SPLUNK_ACCESS_TOKEN' | 'SPLUNK_AUTOINSTRUMENT_PACKAGE_NAMES' + | 'SPLUNK_DEBUG_METRICS_ENABLED' | 'SPLUNK_METRICS_ENABLED' | 'SPLUNK_METRICS_ENDPOINT' | 'SPLUNK_PROFILER_CALL_STACK_INTERVAL' diff --git a/test/metrics.test.ts b/test/metrics.test.ts index c78a4560..226cc973 100644 --- a/test/metrics.test.ts +++ b/test/metrics.test.ts @@ -29,7 +29,7 @@ import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-grpc'; import { OTLPMetricExporter as OTLPHttpProtoMetricExporter } from '@opentelemetry/exporter-metrics-otlp-proto'; import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions'; -import * as utils from './utils'; +import { cleanEnvironment, TestMetricReader } from './utils'; import { hrtime } from 'process'; import { startMetrics, _setDefaultOptions } from '../src/metrics'; @@ -59,19 +59,6 @@ const emptyStats = () => ({ }, }); -class TestMetricReader extends MetricReader { - constructor(public temporality: AggregationTemporality) { - super(); - } - selectAggregationTemporality( - instrumentType: InstrumentType - ): AggregationTemporality { - return this.temporality; - } - protected async onForceFlush() {} - protected async onShutdown() {} -} - describe('metrics', () => { describe('native counters collection', () => { const { metrics } = require('../src/native_ext'); @@ -116,8 +103,8 @@ describe('metrics', () => { }); describe('options', () => { - beforeEach(utils.cleanEnvironment); - after(utils.cleanEnvironment); + beforeEach(cleanEnvironment); + after(cleanEnvironment); it('has expected defaults', () => { const options = _setDefaultOptions(); @@ -135,6 +122,7 @@ describe('metrics', () => { OTLPMetricExporter, 'Expected the default metric exporter to be OTLP gRPC' ); + assert.deepEqual(options.debugMetricsEnabled, false); }); it('is possible to set options via env vars', () => { @@ -144,6 +132,7 @@ describe('metrics', () => { process.env.OTEL_RESOURCE_ATTRIBUTES = 'key1=val1,key2=val2'; process.env.SPLUNK_RUNTIME_METRICS_ENABLED = 'true'; process.env.SPLUNK_RUNTIME_METRICS_COLLECTION_INTERVAL = '1200'; + process.env.SPLUNK_DEBUG_METRICS_ENABLED = 'true'; const options = _setDefaultOptions(); assert.deepEqual(options.serviceName, 'bigmetric'); @@ -157,6 +146,7 @@ describe('metrics', () => { ); assert.deepEqual(options.runtimeMetricsEnabled, true); assert.deepEqual(options.runtimeMetricsCollectionIntervalMillis, 1200); + assert.deepEqual(options.debugMetricsEnabled, true); }); }); @@ -164,11 +154,11 @@ describe('metrics', () => { let reader: TestMetricReader; beforeEach(() => { - utils.cleanEnvironment(); + cleanEnvironment(); reader = new TestMetricReader(AggregationTemporality.CUMULATIVE); }); - after(utils.cleanEnvironment); + after(cleanEnvironment); // Custom metrics and runtime metrics are done with 1 test as OTel meter provider can't be reset it('is possible to use metrics', async () => { diff --git a/test/profiling/extension.test.ts b/test/profiling/extension.test.ts index 5cb5e2d1..fbd22b59 100644 --- a/test/profiling/extension.test.ts +++ b/test/profiling/extension.test.ts @@ -45,7 +45,7 @@ describe('profiling native extension', () => { assert.equal(extension.stop(), null); }); - it('is possible to collect stacktraces', () => { + it('is possible to collect a cpu profile', () => { // returns null if no profiling started assert.equal(extension.collect(), null); @@ -55,7 +55,7 @@ describe('profiling native extension', () => { recordDebugInfo: false, }); - utils.spinMs(1_000); + utils.spinMs(200); const result = extension.collect(); // The types might not be what is declared in typescript, a sanity check. @@ -63,46 +63,22 @@ describe('profiling native extension', () => { const { stacktraces, startTimeNanos } = result; assertNanoSecondString(startTimeNanos); + assert.strictEqual(typeof result.profilerStartDuration, 'number'); + assert.strictEqual(typeof result.profilerStopDuration, 'number'); + assert.strictEqual(typeof result.profilerProcessingStepDuration, 'number'); + assert( - stacktraces.length >= expectedStacktraceCount, - `expected at least ${expectedStacktraceCount} stacktraces, got ${stacktraces.length}` + result.profilerStartDuration > 0, + 'expected profilerStartDuration > 0' + ); + assert( + result.profilerStopDuration > 0, + 'expected profilerStopDuration > 0' + ); + assert( + result.profilerProcessingStepDuration > 0, + 'expected profilerProcessingDuration > 0' ); - - for (const { stacktrace, timestamp } of stacktraces) { - // Don't bother checking for span and trace ID here. - assert.equal(typeof stacktrace, 'string'); - assertNanoSecondString(timestamp); - - // The first two lines are intentionally empty, - // as we don't have information about the thread state. - const lines = stacktrace.split('\n'); - assert.deepEqual(lines[0], ''); - assert.deepEqual(lines[1], ''); - const stacklines = lines.slice(2, -1); - - for (const stackline of stacklines) { - assert(/.+\(.+:\d+:\d+\)/.test(stackline), stackline); - } - } - }); - - it('is possible to collect raw data on stacktraces', () => { - // returns null if no profiling started - assert.equal(extension.collectRaw(), null); - - // Use a lower interval to make sure we capture something - extension.start({ - samplingIntervalMicroseconds: 1_000, - recordDebugInfo: false, - }); - - utils.spinMs(200); - - const result = extension.collectRaw(); - // The types might not be what is declared in typescript, a sanity check. - assert.equal(typeof result, 'object'); - const { stacktraces, startTimeNanos } = result; - assertNanoSecondString(startTimeNanos); assert( stacktraces.length >= expectedStacktraceCount, @@ -151,6 +127,19 @@ describe('profiling native extension', () => { 'not a feasible heap profile timestamp' ); + assert.strictEqual(typeof profile.profilerCollectDuration, 'number'); + assert.strictEqual(typeof profile.profilerProcessingStepDuration, 'number'); + + assert( + profile.profilerCollectDuration > 0, + 'expected profilerCollectDuration > 0' + ); + + assert( + profile.profilerProcessingStepDuration > 0, + 'expected profilerProcessingDuration > 0' + ); + const { treeMap, samples } = profile; assert(samples.length > 0, 'no allocation samples'); diff --git a/test/profiling/profiling.test.ts b/test/profiling/profiling.test.ts index 3e0641cf..fab9ea58 100644 --- a/test/profiling/profiling.test.ts +++ b/test/profiling/profiling.test.ts @@ -30,9 +30,9 @@ import { } from '../../src/profiling'; import { start, stop } from '../../src'; import { + CpuProfile, HeapProfile, ProfilingExporter, - RawProfilingData, } from '../../src/profiling/types'; import { ProfilingContextManager } from '../../src/profiling/ProfilingContextManager'; import { detect as detectResource } from '../../src/resource'; @@ -97,8 +97,8 @@ describe('profiling', () => { let sendCallCount = 0; const stacktracesReceived = []; const exporter: ProfilingExporter = { - send(profilingData: RawProfilingData) { - const { stacktraces } = profilingData; + send(cpuProfile: CpuProfile) { + const { stacktraces } = cpuProfile; sendCallCount += 1; stacktracesReceived.push(...stacktraces); }, @@ -160,7 +160,7 @@ describe('profiling', () => { it('exports heap profiles', async () => { let sendCallCount = 0; const exporter: ProfilingExporter = { - send(_profilingData: RawProfilingData) {}, + send(_cpuProfile: CpuProfile) {}, sendHeapProfile(profile: HeapProfile) { sendCallCount += 1; }, diff --git a/test/separate_process/debug_metrics.test.ts b/test/separate_process/debug_metrics.test.ts new file mode 100644 index 00000000..74ed190b --- /dev/null +++ b/test/separate_process/debug_metrics.test.ts @@ -0,0 +1,94 @@ +/* + * Copyright Splunk Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import * as assert from 'assert'; +import { cleanEnvironment, TestMetricReader } from '../utils'; +import { + AggregationTemporality, + MetricReader, +} from '@opentelemetry/sdk-metrics'; +import { startMetrics } from '../../src/metrics'; +import { start, stop } from '../../src'; + +describe('debug metrics', () => { + let reader: TestMetricReader; + + beforeEach(() => { + cleanEnvironment(); + reader = new TestMetricReader(AggregationTemporality.CUMULATIVE); + }); + + afterEach(async () => { + await stop(); + }); + + it('is possible to use metrics', async () => { + start({ + serviceName: 'debugmetrics', + tracing: false, + profiling: { + collectionDuration: 100, + callstackInterval: 10, + memoryProfilingEnabled: true, + exporterFactory: () => { + return [ + { + send: () => {}, + sendHeapProfile: () => {}, + }, + ]; + }, + }, + metrics: { + debugMetricsEnabled: true, + metricReaderFactory: () => { + return [reader]; + }, + }, + }); + + await new Promise((resolve) => setTimeout(resolve, 1_000)); + + const metricData = await reader.collect(); + const { metrics: debugMetrics } = + metricData.resourceMetrics.scopeMetrics.find( + (sm) => sm.scope.name === 'splunk-otel-js-debug-metrics' + ); + + const allowedNames = new Set([ + 'splunk.profiler.cpu.start.duration', + 'splunk.profiler.cpu.stop.duration', + 'splunk.profiler.cpu.process.duration', + 'splunk.profiler.heap.collect.duration', + 'splunk.profiler.heap.process.duration', + ]); + + assert.deepStrictEqual(debugMetrics.length, allowedNames.size); + + for (const { descriptor, dataPoints } of debugMetrics) { + assert( + allowedNames.has(descriptor.name), + `invalid metric name ${descriptor.name}` + ); + assert.deepStrictEqual(descriptor.type, 'HISTOGRAM'); + assert.deepStrictEqual(descriptor.unit, 'ns'); + assert( + dataPoints[0].value['count'] > 0, + 'expected datapoint count to be more than 0' + ); + } + }); +}); diff --git a/test/utils.ts b/test/utils.ts index 8cf06909..7513ab24 100644 --- a/test/utils.ts +++ b/test/utils.ts @@ -14,6 +14,12 @@ * limitations under the License. */ +import { + AggregationTemporality, + InstrumentType, + MetricReader, +} from '@opentelemetry/sdk-metrics'; + const isConfigVarEntry = (key) => { const lowercased = key.toLowerCase(); return ( @@ -44,3 +50,16 @@ export const spinMs = (ms: number) => { const start = Date.now(); while (Date.now() - start < ms) {} }; + +export class TestMetricReader extends MetricReader { + constructor(public temporality: AggregationTemporality) { + super(); + } + selectAggregationTemporality( + instrumentType: InstrumentType + ): AggregationTemporality { + return this.temporality; + } + protected async onForceFlush() {} + protected async onShutdown() {} +}