Skip to content

Commit

Permalink
[Refactor] Complete metrics overhaul
Browse files Browse the repository at this point in the history
Metrics got an entire overhaul. Instead of relying on a broken
prometheus library to publish our metrics, we now use the
`tracing` library together with OpenTelemetry; the two are bound
together and the result is then published into a prometheus library.

Metrics are now mostly derive-macros. This means that the struct
can express what it wants to export and a help text. The library
will choose if it is able to export it.

Tracing now works by calling `.publish()` on the parent structs;
those structs need to call `.publish()` on all the child members
they wish to publish data about. If a "group" is requested, use
the `group!()` macro, which under-the-hood calls `tracing::span`
with some special labels. At primitive layers, it will call the
`publish!()` macro, which will call the `tracing::event!()` macro
under-the-hood with some special fields set. A custom
`tracing::Subscriber` will intercept all the events and spans
and convert them into a JSON-like object. This object can then
be exported as real JSON or encoded into other formats like
otel/prometheus.

closes: TraceMachina#1164, TraceMachina#650, TraceMachina#384, TraceMachina#209
towards: TraceMachina#206
  • Loading branch information
allada committed Jul 25, 2024
1 parent 3574149 commit 94c7c84
Show file tree
Hide file tree
Showing 72 changed files with 2,647 additions and 1,466 deletions.
306 changes: 306 additions & 0 deletions Cargo.lock

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ nativelink-service = { path = "nativelink-service" }
nativelink-store = { path = "nativelink-store" }
nativelink-util = { path = "nativelink-util" }
nativelink-worker = { path = "nativelink-worker" }
nativelink-metric = { path = "nativelink-metric" }
nativelink-metric-collector = { path = "nativelink-metric-collector" }

async-lock = "3.3.0"
axum = "0.6.20"
Expand All @@ -58,3 +60,12 @@ tokio-rustls = "0.25.0"
tonic = { version = "0.11.0", features = ["gzip", "tls"] }
tower = "0.4.13"
tracing = "0.1.40"
opentelemetry_sdk = { version = "0.23.0", features = ["metrics"] }
tracing-subscriber = "0.3.18"
tracing-opentelemetry = { version = "0.25.0", features = ["metrics"] }
opentelemetry-stdout = "0.5.0"
opentelemetry_api = { version = "0.20.0", features = ["metrics"] }
opentelemetry = { version = "0.23.0", features = ["metrics"] }
prometheus = "0.13.4"
opentelemetry-prometheus = "0.16.0"
serde_json = "1.0.120"
1 change: 1 addition & 0 deletions nativelink-error/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ autobenches = false

[dependencies]
nativelink-proto = { path = "../nativelink-proto" }
nativelink-metric = { path = "../nativelink-metric" }

hex = "0.4.3"
prost = "0.12.4"
Expand Down
13 changes: 13 additions & 0 deletions nativelink-error/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use nativelink_metric::{
MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent,
};
use prost_types::TimestampError;
use serde::{Deserialize, Serialize};

Expand Down Expand Up @@ -47,6 +50,16 @@ pub struct Error {
pub messages: Vec<String>,
}

impl MetricsComponent for Error {
    /// Publishes this error as a metric.
    ///
    /// The error is first rendered with `to_string()`, and the resulting
    /// string's own `publish` is invoked with the same `kind` and
    /// `field_metadata`; its result is returned unchanged.
    fn publish(
        &self,
        kind: MetricKind,
        field_metadata: MetricFieldData,
    ) -> Result<MetricPublishKnownKindData, nativelink_metric::Error> {
        let rendered = self.to_string();
        rendered.publish(kind, field_metadata)
    }
}

impl Error {
pub fn new(code: Code, msg: String) -> Self {
let mut msgs = Vec::with_capacity(1);
Expand Down
18 changes: 18 additions & 0 deletions nativelink-metric-collector/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[package]
name = "nativelink-metric-collector"
version = "0.4.0"
edition = "2021"
rust-version = "1.79.0"

[dependencies]
nativelink-metric = { path = "../nativelink-metric" }

tracing = "0.1.40"
tracing-subscriber = "0.3.18"
opentelemetry = { version = "0.23.0", features = ["metrics"] }
parking_lot = "0.12.2"
serde = "1.0.204"

[dev-dependencies]
nativelink-macro = { path = "../nativelink-macro" }
nativelink-error = { path = "../nativelink-error" }
21 changes: 21 additions & 0 deletions nativelink-metric-collector/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright 2024 The NativeLink Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

pub use otel_exporter::otel_export;
pub use tracing_layers::MetricsCollectorLayer;

mod metrics_collection;
mod metrics_visitors;
mod otel_exporter;
mod tracing_layers;
90 changes: 90 additions & 0 deletions nativelink-metric-collector/src/metrics_collection.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright 2024 The NativeLink Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
borrow::Cow,
collections::HashMap,
ops::{Deref, DerefMut},
};

use serde::Serialize;

use crate::metrics_visitors::CollectionKind;

/// The final primitive value that was collected for a metric, together
/// with the representation it was collected in.
///
/// The enum is `#[serde(untagged)]`, so the derived serializer does not
/// wrap the inner value in a variant name.
#[derive(Debug, Serialize)]
#[serde(untagged)]
pub enum CollectedMetricPrimitiveValue {
/// An unsigned 64-bit numeric value.
Counter(u64),
/// A textual value, either a `'static` borrow or an owned string.
String(Cow<'static, str>),
}

/// The final primitive metric field that was collected.
///
/// Note that the hand-written `Serialize` impl for this type emits only
/// `value`; `help` and `value_type` are not part of the serialized form.
#[derive(Default, Debug)]
pub struct CollectedMetricPrimitive {
/// The collected value, or `None` when no value is present
/// (the `Default` state).
pub value: Option<CollectedMetricPrimitiveValue>,
/// Help text associated with the metric.
pub help: String,
/// The kind of metric this primitive was collected as.
pub value_type: CollectionKind,
}

impl Serialize for CollectedMetricPrimitive {
    /// Serializes just the collected value.
    ///
    /// A `Counter` is written as a `u64`, a `String` as a string, and a
    /// missing value as `none`. The `help` and `value_type` fields are not
    /// written.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        // No value collected: emit the serializer's "none" representation.
        let Some(collected) = self.value.as_ref() else {
            return serializer.serialize_none();
        };
        match collected {
            CollectedMetricPrimitiveValue::Counter(count) => serializer.serialize_u64(*count),
            CollectedMetricPrimitiveValue::String(text) => serializer.serialize_str(text),
        }
    }
}

/// Key-value representation of collected output: a map from a string key
/// (presumably the child's name — confirm against the collector layer) to
/// the metrics collected under that key.
pub type CollectedMetricChildren = HashMap<String, CollectedMetrics>;

/// The type of the collected metric (eg: nested vs primitive).
///
/// The enum is `#[serde(untagged)]`, so the derived serializer emits the
/// inner value directly without a variant name.
#[derive(Debug, Serialize)]
#[serde(untagged)]
pub enum CollectedMetrics {
/// A leaf metric holding a single primitive value.
Primitive(CollectedMetricPrimitive),
/// A nested metric holding a boxed map of child metrics.
Component(Box<CollectedMetricChildren>),
}

impl CollectedMetrics {
    /// Builds a `Component` variant that contains no children yet.
    pub fn new_component() -> Self {
        let children = CollectedMetricChildren::default();
        Self::Component(Box::new(children))
    }
}

/// The root metric component that was collected.
///
/// This is a thin wrapper around `CollectedMetricChildren`. The inner map
/// is `#[serde(flatten)]`-ed, so its entries are serialized as if they were
/// fields of this struct rather than under an `inner` key.
#[derive(Default, Debug, Serialize)]
pub struct RootMetricCollectedMetrics {
// The top-level collected metrics; exposed through `Deref`/`DerefMut`.
#[serde(flatten)]
inner: CollectedMetricChildren,
}

/// Gives shared access to the wrapped map.
impl Deref for RootMetricCollectedMetrics {
type Target = CollectedMetricChildren;

/// Returns a shared reference to the inner map.
fn deref(&self) -> &Self::Target {
&self.inner
}
}

/// Gives mutable access to the wrapped map.
impl DerefMut for RootMetricCollectedMetrics {
/// Returns a mutable reference to the inner map.
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
}
160 changes: 160 additions & 0 deletions nativelink-metric-collector/src/metrics_visitors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// Copyright 2024 The NativeLink Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{borrow::Cow, fmt::Debug};

use nativelink_metric::MetricKind;
use serde::Serialize;
use tracing::field::{Field, Visit};

use crate::metrics_collection::{CollectedMetricPrimitive, CollectedMetricPrimitiveValue};

/// The type of the collected primitive metric.
///
/// `Counter` is the `Default` variant.
#[derive(Default, Debug, Serialize)]
pub enum CollectionKind {
/// The metric is a numeric counter (discriminant `0`).
#[default]
Counter = 0,
/// The metric is a string (discriminant `1`).
String = 1,
}

impl From<MetricKind> for CollectionKind {
    /// Maps a published `MetricKind` onto a `CollectionKind`.
    ///
    /// Only `MetricKind::Counter` becomes `CollectionKind::Counter`; every
    /// other kind, `MetricKind::String` included, becomes
    /// `CollectionKind::String`.
    fn from(kind: MetricKind) -> Self {
        if matches!(kind, MetricKind::Counter) {
            Self::Counter
        } else {
            Self::String
        }
    }
}

/// The raw value captured from a visited field, together with the
/// primitive representation it was captured in.
#[derive(Debug)]
enum ValueWithPrimitiveType {
    /// A value captured as text.
    String(String),
    /// A value captured as an unsigned 64-bit integer.
    U64(u64),
}

impl Default for ValueWithPrimitiveType {
    /// The default value is the integer zero.
    fn default() -> Self {
        Self::U64(0)
    }
}

/// An intermediate struct that will have its contents populated
/// by the `tracing` layer for a given field.
/// This is done by implementing the `Visit` trait and asking the
/// `tracing` library to visit the fields of the captured event
/// and populate this struct.
#[derive(Default, Debug)]
pub struct MetricDataVisitor {
/// Set from the `__name` field of the visited event.
pub name: String,
// Set from the `__value` field; stays at the default (`U64(0)`) when the
// event carries no `__value`.
value: ValueWithPrimitiveType,
// Set from the `__help` field.
help: String,
// Set from the `__type` field; when `None`, the conversion into
// `CollectedMetricPrimitive` derives the kind from `value` instead.
value_type: Option<CollectionKind>,
}

impl From<MetricDataVisitor> for CollectedMetricPrimitive {
fn from(visitor: MetricDataVisitor) -> Self {
let (value, derived_type) = match visitor.value {
ValueWithPrimitiveType::String(s) => (
CollectedMetricPrimitiveValue::String(Cow::Owned(s)),
CollectionKind::String,
),
ValueWithPrimitiveType::U64(u) => (
CollectedMetricPrimitiveValue::Counter(u),
CollectionKind::Counter,
),
};
CollectedMetricPrimitive {
value: Some(value),
help: visitor.help,
value_type: visitor.value_type.unwrap_or(derived_type),
}
}
}

impl Visit for MetricDataVisitor {
fn record_debug(&mut self, _field: &Field, _value: &dyn Debug) {}

fn record_f64(&mut self, field: &Field, value: f64) {
if field.name() == "__value" {
self.value = ValueWithPrimitiveType::String(value.to_string())
}
}
fn record_i64(&mut self, field: &Field, value: i64) {
if field.name() == "__value" {
match u64::try_from(value) {
Ok(v) => self.value = ValueWithPrimitiveType::U64(v),
Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()),
}
}
}
fn record_u64(&mut self, field: &Field, value: u64) {
match field.name() {
"__value" => self.value = ValueWithPrimitiveType::U64(value),
"__type" => self.value_type = Some(MetricKind::from(value).into()),
"__help" => self.help = value.to_string(),
"__name" => self.name = value.to_string(),
field => panic!("UNKNOWN FIELD {field}"),
}
}
fn record_i128(&mut self, field: &Field, value: i128) {
if field.name() == "__value" {
match u64::try_from(value) {
Ok(v) => self.value = ValueWithPrimitiveType::U64(v),
Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()),
}
}
}
fn record_u128(&mut self, field: &Field, value: u128) {
if field.name() == "__value" {
match u64::try_from(value) {
Ok(v) => self.value = ValueWithPrimitiveType::U64(v),
Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()),
}
}
}
fn record_bool(&mut self, field: &Field, value: bool) {
if field.name() == "__value" {
self.value = ValueWithPrimitiveType::U64(u64::from(value));
}
}
fn record_str(&mut self, field: &Field, value: &str) {
match field.name() {
"__value" => self.value = ValueWithPrimitiveType::String(value.to_string()),
"__help" => self.help = value.to_string(),
"__name" => self.name = value.to_string(),
field => panic!("UNKNOWN FIELD {field}"),
}
}
fn record_error(&mut self, _field: &Field, _value: &(dyn std::error::Error + 'static)) {}
}

/// An intermediate struct that will have its contents populated
/// by the `tracing` layer for a given field.
/// This is the same as `MetricDataVisitor` but only captures info
/// about a given span on span creation.
pub struct SpanFields {
/// The span's name, overwritten by the `__name` string field when visited.
pub name: Cow<'static, str>,
}

impl Visit for SpanFields {
    /// Debug-formatted fields are ignored.
    fn record_debug(&mut self, _field: &Field, _value: &dyn Debug) {}

    /// Captures the `__name` string field as the span's name; any other
    /// string field is ignored.
    fn record_str(&mut self, field: &Field, value: &str) {
        if field.name() != "__name" {
            return;
        }
        self.name = Cow::Owned(value.to_owned());
    }
}
Loading

0 comments on commit 94c7c84

Please sign in to comment.