From 26987ac918fd2d3e8192a7cdf52bcf6ec7b4e0a2 Mon Sep 17 00:00:00 2001 From: yusong-yan Date: Tue, 24 Dec 2024 00:47:16 +0000 Subject: [PATCH] [#25422] xCluster: Change Aggregation Function for Certain xCluster Metrics from Sum to Max Summary: `kSum` aggregation is currently used for many xCluster metrics, but this leads to overflows and inaccuracies for metrics like `last_read_hybridtime`, which should instead use maximum aggregation `kMax`. Next Step [[ https://github.com/yugabyte/yugabyte-db/issues/25437 | #25437 ]]: For optimal accuracy in the future, we should implement a `kLatest` aggregation function, which would require each metric to store both the latest value and its corresponding timestamp. Jira: DB-14654 Test Plan: Jenkins Reviewers: hsunder, xCluster, jhe Reviewed By: jhe Subscribers: rthallam, ybase, ycdcxcluster, slingam Differential Revision: https://phorge.dev.yugabyte.com/D40873 --- src/yb/cdc/xrepl_metrics.cc | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/yb/cdc/xrepl_metrics.cc b/src/yb/cdc/xrepl_metrics.cc index 6bc1e1d5768a..cad4e3d2e4cc 100644 --- a/src/yb/cdc/xrepl_metrics.cc +++ b/src/yb/cdc/xrepl_metrics.cc @@ -48,35 +48,42 @@ METRIC_DEFINE_counter(xcluster, rpc_heartbeats_responded, "xCluster Rpc Heartbea METRIC_DEFINE_gauge_int64(xcluster, last_read_opid_term, "xCluster Last Read OpId (Term)", yb::MetricUnit::kOperations, "ID of the Last Read Producer Operation from a xCluster GetChanges request. Format = " - "term.index"); + "term.index", + {0 /* zero means we don't expose it as counter */, yb::AggregationFunction::kMax}); METRIC_DEFINE_gauge_int64(xcluster, last_read_opid_index, "xCluster Last Read OpId (Index)", yb::MetricUnit::kOperations, "ID of the Last Read Producer Operation from a xCluster GetChanges request. 
Format = " - "term.index"); + "term.index", + {0 /* zero means we don't expose it as counter */, yb::AggregationFunction::kMax}); METRIC_DEFINE_gauge_int64(xcluster, last_checkpoint_opid_index, "xCluster Last Checkpoint OpId " "(Index)", yb::MetricUnit::kOperations, "ID of the Last Checkpoint Sent by Consumer in a xCluster GetChanges request. Format = " - "term.index"); + "term.index", + {0 /* zero means we don't expose it as counter */, yb::AggregationFunction::kMax}); METRIC_DEFINE_gauge_uint64(xcluster, last_read_hybridtime, "xCluster Last Read HybridTime.", yb::MetricUnit::kMicroseconds, - "HybridTime of the Last Read Operation from a xCluster GetChanges request"); + "HybridTime of the Last Read Operation from a xCluster GetChanges request", + {0 /* zero means we don't expose it as counter */, yb::AggregationFunction::kMax}); METRIC_DEFINE_gauge_uint64(xcluster, last_read_physicaltime, "xCluster Last Read Physical TIme.", yb::MetricUnit::kMicroseconds, - "Physical Time of the Last Read Operation from a xCluster GetChanges request"); + "Physical Time of the Last Read Operation from a xCluster GetChanges request", + {0 /* zero means we don't expose it as counter */, yb::AggregationFunction::kMax}); METRIC_DEFINE_gauge_uint64(xcluster, last_checkpoint_physicaltime, "xCluster Last Committed Physical Time.", yb::MetricUnit::kMicroseconds, - "Physical Time of the Last Committed Operation on Consumer."); + "Physical Time of the Last Committed Operation on Consumer.", + {0 /* zero means we don't expose it as counter */, yb::AggregationFunction::kMax}); METRIC_DEFINE_gauge_int64(xcluster, last_readable_opid_index, "xCluster Last Readable OpId (Index)", yb::MetricUnit::kOperations, - "Index of the Last Producer Operation that a xCluster GetChanges request COULD read."); + "Index of the Last Producer Operation that a xCluster GetChanges request COULD read.", + {0 /* zero means we don't expose it as counter */, yb::AggregationFunction::kMax}); 
METRIC_DEFINE_gauge_int64(xcluster, async_replication_sent_lag_micros, "xCluster Physical Time Lag Last Sent", @@ -91,7 +98,8 @@ METRIC_DEFINE_gauge_int64(xcluster, async_replication_committed_lag_micros, METRIC_DEFINE_gauge_bool(xcluster, is_bootstrap_required, "Is Bootstrap Required", yb::MetricUnit::kUnits, - "Is bootstrap required for the replication universe."); + "Is bootstrap required for the replication universe.", + {0 /* zero means we don't expose it as counter */, yb::AggregationFunction::kMax}); METRIC_DEFINE_gauge_uint64(xcluster, last_getchanges_time, "xCluster Last GetChanges Physical Time", yb::MetricUnit::kMicroseconds, @@ -109,7 +117,8 @@ METRIC_DEFINE_gauge_int64(xcluster, time_since_last_getchanges, METRIC_DEFINE_gauge_uint64(xcluster, last_caughtup_physicaltime, "xCluster Last Caught-up Physical Time.", yb::MetricUnit::kMicroseconds, - "Physical Time till which consumer has caught-up with producer."); + "Physical Time till which consumer has caught-up with producer.", + {0 /* zero means we don't expose it as counter */, yb::AggregationFunction::kMax}); // CdcSdk Tablet metrics. METRIC_DEFINE_gauge_int64(cdcsdk, cdcsdk_sent_lag_micros, "CDCSDK sent Lag",