feat(prometheus,shred-collector): metrics improvements + add metrics to shred collector (#306)

* feat(prometheus,shred-collector): metrics improvements + add metrics to shred collector

* refactor: remove unused imports

* feat(prometheus): use labels for variants in variant counter, instead of separate metrics (illustrated below)

* refactor(prometheus): remove unused VariantIndex.get

* refactor(prometheus): use standard import style

* refactor(geyser): use metrics prefix for reader and writer

* docs(prometheus): fix typos/inaccuracies in VariantIndexer docs

* feat(prometheus): more descriptive histogram bucket configuration and error messages

* refactor(shred-collector): rename count variable to packet_count

* refactor(shred-collector): more descriptive variable names in shred receiver

* refactor(shred-collector): remove todo about metrics

* refactor: rename Stats structs to Metrics

* docs(contributing): add metrics naming to style guide

* fix: naming

---------

Co-authored-by: x19 <100000306+0xNineteen@users.noreply.github.com>
dnut and 0xNineteen authored Oct 10, 2024
1 parent 300024a commit 9c21b67
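
For context on the variant-label change noted in the commit message: a variant counter exposes one metric with a `variant` label per enum value, rather than one metric name per variant. Illustrative scrape output only — these metric and label names are hypothetical, not taken from this commit:

```
# before: one counter per variant
shred_collector_data_shreds_received_total 512
shred_collector_code_shreds_received_total 128

# after: one counter, one label per variant
shred_collector_shreds_received_total{variant="data"} 512
shred_collector_shreds_received_total{variant="code"} 128
```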
Showing 25 changed files with 879 additions and 428 deletions.
13 changes: 13 additions & 0 deletions docs/CONTRIBUTING.md
@@ -100,3 +100,16 @@ fn do_something(maybe_foo: ?Foo) void {
### Linting

- run `zig fmt src/` in the top-level directory to run the zig linter


### Metrics

It's common for a single context to deal with multiple Prometheus metrics. In that case, it may be useful to group them together in their own struct. Any metrics structs following this pattern must end their names with the word `Metrics`. If practical, `Registry.initStruct` or `Registry.initFields` is recommended for the struct's initialization.

The name of the struct should be prefixed with a name describing the scope where it is used, as in `GossipMetrics`. The prefix may be omitted if the context is already obvious, and the following are true about the metrics struct:
- It is a private const.
- It is the only metrics struct defined in that context.
- It is expected to contain all metrics that would ever be used in any context where the struct is accessible.
- The context's name would be redundant in the struct's name. For example, shred_processor.zig encloses a metrics struct for which the only sensible names would be `ShredProcessorMetrics` or `Metrics`; since the context is already clear from the file, the prefix may be omitted.

If the Metrics struct is simple and only used within one struct, it can be defined within that struct; otherwise, it can be defined directly underneath the struct that uses it.
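
A minimal sketch of the pattern (the service name and fields are hypothetical, and the import paths for `globalRegistry` and the hypothetical `sig.zig` root are assumptions):

```zig
const sig = @import("../sig.zig"); // illustrative import path

const Counter = sig.prometheus.counter.Counter;
const Gauge = sig.prometheus.Gauge;
const GetMetricError = sig.prometheus.registry.GetMetricError;
const globalRegistry = sig.prometheus.globalRegistry;

/// Metrics for a hypothetical repair service.
pub const RepairMetrics = struct {
    /// Total repair requests sent since startup.
    requests_sent: *Counter,
    /// Repair requests currently awaiting a response.
    requests_pending: *Gauge(u64),

    pub fn init() GetMetricError!RepairMetrics {
        // initStruct registers one metric per field, named after the field.
        return globalRegistry().initStruct(RepairMetrics);
    }
};
```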
186 changes: 84 additions & 102 deletions src/accountsdb/db.zig
@@ -35,6 +35,7 @@ const Level = sig.trace.level.Level;
const NestedHashTree = sig.common.merkle_tree.NestedHashTree;
const GetMetricError = sig.prometheus.registry.GetMetricError;
const Counter = sig.prometheus.counter.Counter;
const Gauge = sig.prometheus.Gauge;
const Histogram = sig.prometheus.histogram.Histogram;
const ClientVersion = sig.version.ClientVersion;
const StatusCache = sig.accounts_db.StatusCache;
@@ -57,86 +58,6 @@ pub const ACCOUNT_INDEX_BINS: usize = 8192;
pub const ACCOUNT_FILE_SHRINK_THRESHOLD = 70; // shrink account files with more than X% dead bytes
pub const DELETE_ACCOUNT_FILES_MIN = 100;

pub const AccountsDBStats = struct {
const HistogramKind = enum {
flush_account_file_size,
shrink_file_shrunk_by,
shrink_alive_accounts,
shrink_dead_accounts,
time_flush,
time_clean,
time_shrink,
time_purge,

fn buckets(self: HistogramKind) []const f64 {
const account_size_buckets = &.{
// 10 bytes -> 10MB (solana max account size)
10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000,
};
const account_count_buckets = &.{ 1, 5, 10, 50, 100, 500, 1_000, 10_000 };
const nanosecond_buckets = &.{
// 0.001ms -> 10ms
1_000, 10_000, 100_000, 1_000_000, 10_000_000,
// 50ms -> 1000ms
50_000_000, 100_000_000, 200_000_000, 400_000_000, 1_000_000_000,
};

return switch (self) {
.flush_account_file_size, .shrink_file_shrunk_by => account_size_buckets,
.shrink_alive_accounts, .shrink_dead_accounts => account_count_buckets,
.time_flush, .time_clean, .time_shrink, .time_purge => nanosecond_buckets,
};
}
};

number_files_flushed: *Counter,
number_files_cleaned: *Counter,
number_files_shrunk: *Counter,
number_files_deleted: *Counter,

time_flush: *Histogram,
time_clean: *Histogram,
time_shrink: *Histogram,
time_purge: *Histogram,

flush_account_file_size: *Histogram,
flush_accounts_written: *Counter,

clean_references_deleted: *Counter,
clean_files_queued_deletion: *Counter,
clean_files_queued_shrink: *Counter,
clean_slot_old_state: *Counter,
clean_slot_zero_lamports: *Counter,

shrink_file_shrunk_by: *Histogram,
shrink_alive_accounts: *Histogram,
shrink_dead_accounts: *Histogram,

const Self = @This();

pub fn init() GetMetricError!Self {
var self: Self = undefined;
const registry = globalRegistry();
const stats_struct_info = @typeInfo(Self).Struct;
inline for (stats_struct_info.fields) |field| {
@field(self, field.name) = switch (field.type) {
*Counter => try registry.getOrCreateCounter(field.name),
*Histogram => blk: {
@setEvalBranchQuota(2000); // stringToEnum requires a little more than default
const histogram_kind = comptime std.meta.stringToEnum(
HistogramKind,
field.name,
) orelse @compileError("no matching HistogramKind for AccountsDBStats *Histogram field");

break :blk try registry.getOrCreateHistogram(field.name, histogram_kind.buckets());
},
else => @compileError("Unsupported field type"),
};
}
return self;
}
};

/// database for accounts
///
/// Analogous to [AccountsDb](https://github.com/anza-xyz/agave/blob/4c921ca276bbd5997f809dec1dd3937fb06463cc/accounts-db/src/accounts_db.rs#L1363)
@@ -190,7 +111,7 @@ pub const AccountsDB = struct {
// TODO: move to Bank struct
bank_hash_stats: RwMux(BankHashStatsMap) = RwMux(BankHashStatsMap).init(.{}),

stats: AccountsDBStats,
metrics: AccountsDBMetrics,
logger: Logger,
config: InitConfig,

@@ -253,8 +174,7 @@
);
errdefer account_index.deinit(true);

const stats = try AccountsDBStats.init();

const metrics = try AccountsDBMetrics.init();
return .{
.allocator = allocator,
.disk_allocator_ptr = maybe_disk_allocator_ptr,
@@ -264,7 +184,7 @@
.account_cache = RwMux(AccountCache).init(AccountCache.init(allocator)),
.snapshot_dir = snapshot_dir,
.dead_accounts_counter = RwMux(DeadAccountsCounter).init(DeadAccountsCounter.init(allocator)),
.stats = stats,
.metrics = metrics,
.geyser_writer = geyser_writer,
};
}
@@ -1423,7 +1343,7 @@ pub const AccountsDB = struct {
pub fn flushSlot(self: *Self, slot: Slot) !FileId {
var timer = try sig.time.Timer.start();

defer self.stats.number_files_flushed.inc();
defer self.metrics.number_files_flushed.inc();

const pubkeys, const accounts: []const Account = blk: {
// NOTE: flush should be the only function to delete/free cache slices of a flushed slot
@@ -1441,7 +1361,7 @@
for (accounts) |*account| {
const account_size_in_file = account.getSizeInFile();
size += account_size_in_file;
self.stats.flush_account_file_size.observe(@floatFromInt(account_size_in_file));
self.metrics.flush_account_file_size.observe(account_size_in_file);
}

const file, const file_id, const memory = try self.createAccountFile(size, slot);
@@ -1469,7 +1389,7 @@
try file_map.putNoClobber(self.allocator, file_id, account_file);
}

self.stats.flush_accounts_written.add(account_file.number_of_accounts);
self.metrics.flush_accounts_written.add(account_file.number_of_accounts);

// update the reference AFTER the data exists
for (pubkeys, offsets) |pubkey, offset| {
@@ -1510,7 +1430,7 @@
self.allocator.free(pubkeys);
}

self.stats.time_flush.observe(@floatFromInt(timer.read().asNanos()));
self.metrics.time_flush.observe(timer.read().asNanos());

// return to queue for cleaning
return file_id;
@@ -1536,7 +1456,7 @@

defer {
const number_of_files = unclean_account_files.len;
self.stats.number_files_cleaned.add(number_of_files);
self.metrics.number_files_cleaned.add(number_of_files);
}

var num_zero_lamports: usize = 0;
@@ -1659,19 +1579,19 @@ }
}
}
references_to_delete.clearRetainingCapacity();
self.stats.clean_references_deleted.set(references_to_delete.items.len);
self.metrics.clean_references_deleted.set(references_to_delete.items.len);
self.logger.debug().logf(
"cleaned slot {} - old_state: {}, zero_lamports: {}",
.{ account_file.slot, num_old_states, num_zero_lamports },
);
}

self.stats.clean_files_queued_deletion.set(delete_account_files.count());
self.stats.clean_files_queued_shrink.set(delete_account_files.count());
self.stats.clean_slot_old_state.set(num_old_states);
self.stats.clean_slot_zero_lamports.set(num_zero_lamports);
self.metrics.clean_files_queued_deletion.set(delete_account_files.count());
self.metrics.clean_files_queued_shrink.set(delete_account_files.count());
self.metrics.clean_slot_old_state.set(num_old_states);
self.metrics.clean_slot_zero_lamports.set(num_zero_lamports);

self.stats.time_clean.observe(@floatFromInt(timer.read().asNanos()));
self.metrics.time_clean.observe(timer.read().asNanos());
return .{
.num_zero_lamports = num_zero_lamports,
.num_old_states = num_old_states,
@@ -1686,7 +1606,7 @@
) !void {
const number_of_files = delete_account_files.len;
defer {
self.stats.number_files_deleted.add(number_of_files);
self.metrics.number_files_deleted.add(number_of_files);
}

var delete_queue = try std.ArrayList(AccountFile).initCapacity(
@@ -1772,7 +1692,7 @@

defer {
const number_of_files = shrink_account_files.len;
self.stats.number_files_shrunk.add(number_of_files);
self.metrics.number_files_shrunk.add(number_of_files);
}

var alive_pubkeys = std.AutoArrayHashMap(Pubkey, void).init(self.allocator);
@@ -1834,9 +1754,9 @@
std.debug.assert(accounts_dead_count > 0);
total_accounts_deleted += accounts_dead_count;

self.stats.shrink_alive_accounts.observe(@floatFromInt(accounts_alive_count));
self.stats.shrink_dead_accounts.observe(@floatFromInt(accounts_dead_count));
self.stats.shrink_file_shrunk_by.observe(@floatFromInt(accounts_dead_size));
self.metrics.shrink_alive_accounts.observe(accounts_alive_count);
self.metrics.shrink_dead_accounts.observe(accounts_dead_count);
self.metrics.shrink_file_shrunk_by.observe(accounts_dead_size);

self.logger.debug().logf("n alive accounts: {}", .{accounts_alive_count});
self.logger.debug().logf("n dead accounts: {}", .{accounts_dead_count});
@@ -1947,7 +1867,7 @@
}
}

self.stats.time_shrink.observe(@floatFromInt(timer.read().asNanos()));
self.metrics.time_shrink.observe(timer.read().asNanos());

return .{
.num_accounts_deleted = total_accounts_deleted,
@@ -1995,7 +1915,7 @@
self.allocator.free(accounts);
self.allocator.free(pubkeys);

self.stats.time_purge.observe(@floatFromInt(timer.read().asNanos()));
self.metrics.time_purge.observe(timer.read().asNanos());
}

// NOTE: we need to acquire locks which requires `self: *Self` but we never modify any data
@@ -2831,6 +2751,68 @@
}
};

pub const AccountsDBMetrics = struct {
number_files_flushed: *Counter,
number_files_cleaned: *Counter,
number_files_shrunk: *Counter,
number_files_deleted: *Counter,

time_flush: *Histogram,
time_clean: *Histogram,
time_shrink: *Histogram,
time_purge: *Histogram,

flush_account_file_size: *Histogram,
flush_accounts_written: *Counter,

clean_references_deleted: *Gauge(u64),
clean_files_queued_deletion: *Gauge(u64),
clean_files_queued_shrink: *Gauge(u64),
clean_slot_old_state: *Gauge(u64),
clean_slot_zero_lamports: *Gauge(u64),

shrink_file_shrunk_by: *Histogram,
shrink_alive_accounts: *Histogram,
shrink_dead_accounts: *Histogram,

const Self = @This();

pub fn init() GetMetricError!Self {
return globalRegistry().initStruct(Self);
}

pub fn histogramBucketsForField(comptime field_name: []const u8) []const f64 {
const HistogramKind = enum {
flush_account_file_size,
shrink_file_shrunk_by,
shrink_alive_accounts,
shrink_dead_accounts,
time_flush,
time_clean,
time_shrink,
time_purge,
};

const account_size_buckets = &.{
// 10 bytes -> 10MB (solana max account size)
10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000,
};
const account_count_buckets = &.{ 1, 5, 10, 50, 100, 500, 1_000, 10_000 };
const nanosecond_buckets = &.{
// 0.001ms -> 10ms
1_000, 10_000, 100_000, 1_000_000, 10_000_000,
// 50ms -> 1000ms
50_000_000, 100_000_000, 200_000_000, 400_000_000, 1_000_000_000,
};

return switch (@field(HistogramKind, field_name)) {
.flush_account_file_size, .shrink_file_shrunk_by => account_size_buckets,
.shrink_alive_accounts, .shrink_dead_accounts => account_count_buckets,
.time_flush, .time_clean, .time_shrink, .time_purge => nanosecond_buckets,
};
}
};

/// this is used when loading from a snapshot. it uses a fixed buffer allocator
/// to allocate memory which is used to clone account data slices of account files.
/// the memory is serialized into bincode and sent through the pipe.
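
Taken together, the db.zig changes replace the hand-written comptime loop deleted from `AccountsDBStats.init` with the generic `Registry.initStruct`, and move bucket selection into the new `histogramBucketsForField`. A plausible shape for such a generic initializer, modeled on the deleted loop (`Registry` stands in for sig's actual registry type, and `getOrCreateGauge` is an assumed name; the real `Registry.initStruct` body is not part of this diff):

```zig
pub fn initStruct(registry: *Registry, comptime T: type) GetMetricError!T {
    var self: T = undefined;
    inline for (@typeInfo(T).Struct.fields) |field| {
        @field(self, field.name) = switch (field.type) {
            *Counter => try registry.getOrCreateCounter(field.name),
            // getOrCreateGauge is an assumed name, by analogy with the
            // getOrCreateCounter/getOrCreateHistogram calls deleted above.
            *Gauge(u64) => try registry.getOrCreateGauge(field.name, u64),
            *Histogram => try registry.getOrCreateHistogram(
                field.name,
                // the metrics struct supplies its own bucket boundaries
                T.histogramBucketsForField(field.name),
            ),
            else => @compileError("unsupported metric field type"),
        };
    }
    return self;
}
```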
2 changes: 2 additions & 0 deletions src/cmd/cmd.zig
@@ -735,6 +735,7 @@ fn validator() !void {
.allocator = allocator,
.logger = app_base.logger,
.random = rng.random(),
.registry = app_base.metrics_registry,
.my_keypair = &app_base.my_keypair,
.exit = &app_base.exit,
.gossip_table_rw = &gossip_service.gossip_table_rw,
@@ -831,6 +832,7 @@ fn shredCollector() !void {
.allocator = allocator,
.logger = app_base.logger,
.random = rng.random(),
.registry = app_base.metrics_registry,
.my_keypair = &app_base.my_keypair,
.exit = &app_base.exit,
.gossip_table_rw = &gossip_service.gossip_table_rw,