Skip to content

Commit

Permalink
feat: debug metrics (#700)
Browse files Browse the repository at this point in the history
* feat: produce debug metrics for cpu profiling, get rid of text based cpu profiles

* feat: emit debug metrics

* fix: don't clean coverage

* doc: mention debug metrics
  • Loading branch information
seemk authored Mar 21, 2023
1 parent c2c2c75 commit b10f7a5
Show file tree
Hide file tree
Showing 16 changed files with 355 additions and 401 deletions.
1 change: 1 addition & 0 deletions docs/advanced-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ The following config options can be set by passing them as tracing arguments to
| n/a<br>`metrics.resourceFactory` | | Experimental | Callback that allows filtering the default resource or providing a custom one. The function takes one argument of type `Resource`, which is the resource pre-filled by the SDK containing the `service.name`, environment, host and process attributes. |
| `SPLUNK_RUNTIME_METRICS_ENABLED`<br>`metrics.runtimeMetricsEnabled` | `true` | Experimental | Enable collecting and exporting of runtime metrics.
| `SPLUNK_RUNTIME_METRICS_COLLECTION_INTERVAL`<br>`metrics.runtimeMetricsCollectionIntervalMillis` | `5000` | Experimental | The interval, in milliseconds, during which GC and event loop statistics are collected. After the collection is done, the values become available to the metric exporter.
| `SPLUNK_DEBUG_METRICS_ENABLED`<br>`metrics.debugMetricsEnabled` | `false` | Experimental | Enable collection of various internal metrics (e.g. the profiler's internal performance). Only useful when troubleshooting issues and should not be switched on otherwise.

### Profiling

Expand Down
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
"lint:fix": "eslint . --ext .ts --fix",
"lint": "eslint . --ext .ts",
"lint:commits": "commitlint",
"test": "cross-env TEST_ALLOW_DOUBLE_START=y nyc ts-mocha --timeout 60s --parallel --jobs 2 -p tsconfig.json 'test/**/*.test.ts'",
"test": "npm run test:unit && npm run test:debug-metrics",
"test:unit": "cross-env TEST_ALLOW_DOUBLE_START=y nyc ts-mocha --exclude 'test/separate_process/*' --timeout 60s --parallel --jobs 2 -p tsconfig.json 'test/**/*.test.ts'",
"test:debug-metrics": "nyc --no-clean ts-mocha -p tsconfig.json 'test/separate_process/debug_metrics.test.ts'",
"prebuild:current": "node scripts/prebuild-current.js",
"prebuild:os": "node scripts/prebuild-os.js",
"profile:proto": "pbjs -t static-module -w commonjs -o src/profiling/proto/profile.js protos/pprof/profile.proto",
Expand Down
119 changes: 119 additions & 0 deletions src/metrics/debug_metrics.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
* Copyright Splunk Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import { Histogram, metrics, Meter } from '@opentelemetry/api';
import {
ExplicitBucketHistogramAggregation,
View,
} from '@opentelemetry/sdk-metrics';

// Bundle of the debug-metrics meter and the histograms created from it by
// enableDebugMetrics(). Each histogram is created with unit 'ns'.
interface Meters {
// Meter the histograms below were created from.
meter: Meter;
// Fed by recordCpuProfilerMetrics().
cpuProfilerStartDuration: Histogram;
cpuProfilerStopDuration: Histogram;
cpuProfilerProcessingStepDuration: Histogram;
// Fed by recordHeapProfilerMetrics().
heapProfilerCollectDuration: Histogram;
heapProfilerProcessingStepDuration: Histogram;
}

// Set by enableDebugMetrics(). While this is undefined the record* functions
// in this module return without recording anything.
let meters: Meters | undefined;

// Instrument names, shared between histogram creation and the view
// definitions so the two cannot drift apart.
const instrumentCpuProfilerStart = 'splunk.profiler.cpu.start.duration';
const instrumentCpuProfilerStop = 'splunk.profiler.cpu.stop.duration';
const instrumentCpuProfilerProcess = 'splunk.profiler.cpu.process.duration';
const instrumentHeapProfilerCollect = 'splunk.profiler.heap.collect.duration';
const instrumentHeapProfilerProcess = 'splunk.profiler.heap.process.duration';

/**
 * Turns on debug metrics by creating the profiler duration histograms on the
 * globally registered meter provider and storing them in the module-level
 * `meters` state. Until this has been called, the `record*` functions of this
 * module are no-ops.
 *
 * All histograms are created with unit `ns`.
 */
export function enableDebugMetrics() {
  const debugMeter = metrics.getMeter('splunk-otel-js-debug-metrics');
  const histogramOptions = { unit: 'ns' };

  // Every instrument shares the same options; only the name differs.
  const durationHistogram = (instrumentName: string): Histogram =>
    debugMeter.createHistogram(instrumentName, histogramOptions);

  // Property initializers run in source order, so the instruments are created
  // in the order: CPU start, CPU stop, CPU process, heap collect, heap process.
  meters = {
    meter: debugMeter,
    cpuProfilerStartDuration: durationHistogram(instrumentCpuProfilerStart),
    cpuProfilerStopDuration: durationHistogram(instrumentCpuProfilerStop),
    cpuProfilerProcessingStepDuration: durationHistogram(
      instrumentCpuProfilerProcess
    ),
    heapProfilerCollectDuration: durationHistogram(
      instrumentHeapProfilerCollect
    ),
    heapProfilerProcessingStepDuration: durationHistogram(
      instrumentHeapProfilerProcess
    ),
  };
}

/**
 * Records one set of CPU profiler timings into the debug histograms.
 * Does nothing when debug metrics have not been enabled.
 *
 * @param metrics - Durations of the profiler start, stop and processing
 *   steps. The histograms they are recorded into are created with unit `ns`.
 */
export function recordCpuProfilerMetrics(metrics: {
  profilerStartDuration: number;
  profilerStopDuration: number;
  profilerProcessingStepDuration: number;
}) {
  if (meters !== undefined) {
    const {
      cpuProfilerStartDuration,
      cpuProfilerStopDuration,
      cpuProfilerProcessingStepDuration,
    } = meters;

    cpuProfilerStartDuration.record(metrics.profilerStartDuration);
    cpuProfilerStopDuration.record(metrics.profilerStopDuration);
    cpuProfilerProcessingStepDuration.record(
      metrics.profilerProcessingStepDuration
    );
  }
}

/**
 * Records one set of heap profiler timings into the debug histograms.
 * Does nothing when debug metrics have not been enabled.
 *
 * @param metrics - Durations of the heap profile collection and processing
 *   steps. The histograms they are recorded into are created with unit `ns`.
 */
export function recordHeapProfilerMetrics(metrics: {
  profilerCollectDuration: number;
  profilerProcessingStepDuration: number;
}) {
  if (meters !== undefined) {
    const { heapProfilerCollectDuration, heapProfilerProcessingStepDuration } =
      meters;

    heapProfilerCollectDuration.record(metrics.profilerCollectDuration);
    heapProfilerProcessingStepDuration.record(
      metrics.profilerProcessingStepDuration
    );
  }
}

/**
 * Builds the metric views that give the debug duration histograms explicit
 * bucket boundaries suited to nanosecond values.
 *
 * The histograms are created with unit `ns`, so the boundaries
 * 1e6 / 1e8 / 1e9 / 1e10 correspond to 1 ms / 100 ms / 1 s / 10 s.
 *
 * The heap profiler instruments are included alongside the CPU profiler ones:
 * they are recorded in the same unit, and without a view they would not get
 * these boundaries.
 *
 * @returns One view per debug histogram instrument.
 */
export function getDebugMetricsViews(): View[] {
  return [
    instrumentCpuProfilerStart,
    instrumentCpuProfilerStop,
    instrumentCpuProfilerProcess,
    instrumentHeapProfilerCollect,
    instrumentHeapProfilerProcess,
  ].map(
    (instrumentName) =>
      new View({
        instrumentName,
        aggregation: new ExplicitBucketHistogramAggregation(
          [1e6, 1e8, 1e9, 1e10],
          // NOTE(review): presumably the recordMinMax flag - confirm against
          // the installed @opentelemetry/sdk-metrics version.
          true
        ),
      })
  );
}
16 changes: 15 additions & 1 deletion src/metrics/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import {
getEnvValueByPrecedence,
getNonEmptyEnvVar,
} from '../utils';
import { enableDebugMetrics, getDebugMetricsViews } from './debug_metrics';
import * as util from 'util';
import { detect as detectResource } from '../resource';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
Expand All @@ -51,6 +52,7 @@ interface MetricsOptions {
views?: View[];
exportIntervalMillis: number;
metricReaderFactory: MetricReaderFactory;
debugMetricsEnabled: boolean;
runtimeMetricsEnabled: boolean;
runtimeMetricsCollectionIntervalMillis: number;
}
Expand Down Expand Up @@ -237,16 +239,21 @@ export const allowedMetricsOptions = [
'runtimeMetricsEnabled',
'runtimeMetricsCollectionIntervalMillis',
'serviceName',
'debugMetricsEnabled',
];

export function startMetrics(opts: StartMetricsOptions = {}) {
assertNoExtraneousProperties(opts, allowedMetricsOptions);

const options = _setDefaultOptions(opts);

const debugMetricsViews: View[] = options.debugMetricsEnabled
? getDebugMetricsViews()
: [];

const provider = new MeterProvider({
resource: options.resource,
views: options.views,
views: [...(options.views || []), ...debugMetricsViews],
});

const metricReaders = options.metricReaderFactory(options);
Expand All @@ -263,6 +270,10 @@ export function startMetrics(opts: StartMetricsOptions = {}) {
await provider.shutdown();
}

if (options.debugMetricsEnabled) {
enableDebugMetrics();
}

if (!options.runtimeMetricsEnabled) {
return {
stop: stopGlobalMetrics,
Expand Down Expand Up @@ -425,6 +436,9 @@ export function _setDefaultOptions(
exportIntervalMillis:
options.exportIntervalMillis ||
getEnvNumber('OTEL_METRIC_EXPORT_INTERVAL', 30_000),
debugMetricsEnabled:
options.debugMetricsEnabled ??
getEnvBoolean('SPLUNK_DEBUG_METRICS_ENABLED', false),
runtimeMetricsEnabled:
options.runtimeMetricsEnabled ??
getEnvBoolean('SPLUNK_RUNTIME_METRICS_ENABLED', true),
Expand Down
10 changes: 10 additions & 0 deletions src/native_ext/memory_profiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,14 @@ NAN_METHOD(CollectHeapProfile) {
return;
}

int64_t allocationProfileStart = HrTime();
v8::AllocationProfile* profile = profiler->GetAllocationProfile();

if (!profile) {
return;
}

int64_t sampleProcessingStart = HrTime();
auto jsResult = Nan::New<v8::Object>();
auto jsSamples = Nan::New<v8::Array>();
auto jsNodeTree = Nan::New<v8::Object>();
Expand Down Expand Up @@ -185,11 +187,19 @@ NAN_METHOD(CollectHeapProfile) {
}
}

int64_t sampleProcessingEnd = HrTime();

Nan::Set(jsResult, Nan::New<v8::String>("treeMap").ToLocalChecked(), jsNodeTree);
Nan::Set(jsResult, Nan::New<v8::String>("samples").ToLocalChecked(), jsSamples);
Nan::Set(
jsResult, Nan::New<v8::String>("timestamp").ToLocalChecked(),
Nan::New<v8::Number>(MilliSecondsSinceEpoch()));
Nan::Set(
jsResult, Nan::New<v8::String>("profilerCollectDuration").ToLocalChecked(),
Nan::New<v8::Number>((double)(sampleProcessingStart - allocationProfileStart)));
Nan::Set(
jsResult, Nan::New<v8::String>("profilerProcessingStepDuration").ToLocalChecked(),
Nan::New<v8::Number>((double)(sampleProcessingEnd - sampleProcessingStart)));

info.GetReturnValue().Set(jsResult);

Expand Down
Loading

0 comments on commit b10f7a5

Please sign in to comment.