Add installed_extensions prometheus metric #9608

Merged: 8 commits, Nov 13, 2024
Changes from 6 commits
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions compute_tools/Cargo.toml
@@ -18,9 +18,11 @@ clap.workspace = true
flate2.workspace = true
futures.workspace = true
hyper0 = { workspace = true, features = ["full"] }
metrics.workspace = true
nix.workspace = true
notify.workspace = true
num_cpus.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
postgres.workspace = true
@@ -39,6 +41,7 @@ tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true
prometheus.workspace = true

compute_api.workspace = true
utils.workspace = true
25 changes: 25 additions & 0 deletions compute_tools/src/http/api.rs
@@ -9,6 +9,7 @@ use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use crate::installed_extensions;
use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
use compute_api::responses::{
ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
@@ -19,6 +20,8 @@ use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use metrics::Encoder;
use metrics::TextEncoder;
use tokio::task;
use tracing::{debug, error, info, warn};
use tracing_utils::http::OtelName;
@@ -65,6 +68,28 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
        }

        // Prometheus metrics
        (&Method::GET, "/metrics") => {
            debug!("serving /metrics GET request");

            let mut buffer = vec![];
            let metrics = installed_extensions::collect();
            let encoder = TextEncoder::new();
            encoder.encode(&metrics, &mut buffer).unwrap();

            match Response::builder()
                .status(StatusCode::OK)
                .header(CONTENT_TYPE, encoder.format_type())
                .body(Body::from(buffer))
            {
                Ok(response) => response,
                Err(err) => {
                    let msg = format!("error handling /metrics request: {err}");
                    error!(msg);
                    render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
                }
            }
        }

        // Collect Postgres current usage insights
        (&Method::GET, "/insights") => {
            info!("serving /insights GET request");
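For orientation, the body served by the new /metrics endpoint should look roughly like the following Prometheus text exposition. This is an illustrative sample only: the HELP text comes from the gauge definition in installed_extensions.rs, and the values mirror those asserted in the regression test further down.

# HELP installed_extensions Number of databases where the version of extension is installed
# TYPE installed_extensions gauge
installed_extensions{extension_name="neon",version="1.2"} 2
installed_extensions{extension_name="neon",version="1.3"} 1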
15 changes: 15 additions & 0 deletions compute_tools/src/http/openapi_spec.yaml
@@ -37,6 +37,21 @@ paths:
              schema:
                $ref: "#/components/schemas/ComputeMetrics"

  /metrics:
    get:
      tags:
        - Info
      summary: Get compute node metrics in text format.
      description: ""
      operationId: getComputeMetrics
      responses:
        200:
          description: ComputeMetrics
          content:
            text/plain:
              schema:
                type: string
                description: Metrics in text format.
  /insights:
    get:
      tags:
31 changes: 28 additions & 3 deletions compute_tools/src/installed_extensions.rs
@@ -1,4 +1,5 @@
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use metrics::proto::MetricFamily;
use std::collections::HashMap;
use std::collections::HashSet;
use tracing::info;
@@ -8,6 +9,10 @@ use anyhow::Result;
use postgres::{Client, NoTls};
use tokio::task;

use metrics::core::Collector;
use metrics::{register_uint_gauge_vec, UIntGaugeVec};
use once_cell::sync::Lazy;

/// We don't reuse get_existing_dbs() just for code clarity
/// and to make database listing query here more explicit.
///
@@ -59,6 +64,12 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension

for (extname, v) in extensions.iter() {
let version = v.to_string();

// Increment the count of databases where this version of the extension is installed.
INSTALLED_EXTENSIONS
.with_label_values(&[extname, &version])
.inc();

extensions_map
.entry(extname.to_string())
.and_modify(|e| {
@@ -74,9 +85,11 @@
}
}

Ok(InstalledExtensions {
let res = InstalledExtensions {
extensions: extensions_map.values().cloned().collect(),
})
};

Ok(res)
})
.await?
}
@@ -97,6 +110,18 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
"[NEON_EXT_STAT] {}",
serde_json::to_string(&result).expect("failed to serialize extensions list")
);

Ok(())
}

static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"installed_extensions",
"Number of databases where the version of extension is installed",
&["extension_name", "version"]
)
.expect("failed to define a metric")
});

pub fn collect() -> Vec<MetricFamily> {
INSTALLED_EXTENSIONS.collect()
}
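To show how the pieces above fit together, here is a minimal, self-contained sketch of the same register-then-encode pattern. It is an illustration, not the PR's code: it uses the upstream prometheus crate's register_int_gauge_vec!/IntGaugeVec and once_cell directly, on the assumption that Neon's metrics crate re-exports equivalent UIntGaugeVec types; the main function and the sample values are hypothetical.

// Illustrative sketch only; names mirror the PR, but this uses the upstream
// `prometheus` crate's IntGaugeVec rather than Neon's UIntGaugeVec wrapper.
use once_cell::sync::Lazy;
use prometheus::core::Collector;
use prometheus::proto::MetricFamily;
use prometheus::{register_int_gauge_vec, Encoder, IntGaugeVec, TextEncoder};

static INSTALLED_EXTENSIONS: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "installed_extensions",
        "Number of databases where the version of extension is installed",
        &["extension_name", "version"]
    )
    .expect("failed to define a metric")
});

// Collect only this metric family, as the /metrics handler does via installed_extensions::collect().
fn collect() -> Vec<MetricFamily> {
    INSTALLED_EXTENSIONS.collect()
}

fn main() {
    // Pretend we saw `neon` 1.2 in two databases and 1.3 in one.
    INSTALLED_EXTENSIONS.with_label_values(&["neon", "1.2"]).set(2);
    INSTALLED_EXTENSIONS.with_label_values(&["neon", "1.3"]).set(1);

    // Encode the collected family into the Prometheus text exposition format,
    // mirroring what the /metrics handler writes into the response body.
    let mut buffer = Vec::new();
    TextEncoder::new()
        .encode(&collect(), &mut buffer)
        .expect("failed to encode metrics");
    println!("{}", String::from_utf8(buffer).unwrap());
}

Running this prints the same exposition-format lines that the /metrics handler serves.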
5 changes: 5 additions & 0 deletions test_runner/fixtures/endpoint/http.py
@@ -46,3 +46,8 @@ def set_role_grants(self, database: str, role: str, schema: str, privileges: lis
        )
        res.raise_for_status()
        return res.json()

    def metrics(self) -> str:
        res = self.get(f"http://localhost:{self.port}/metrics")
        res.raise_for_status()
        return res.text
57 changes: 56 additions & 1 deletion test_runner/regress/test_installed_extensions.py
@@ -1,6 +1,12 @@
import time
from logging import info
from typing import TYPE_CHECKING

from fixtures.neon_fixtures import NeonEnv
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnv


def test_installed_extensions(neon_simple_env: NeonEnv):
@@ -85,3 +91,52 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
assert ext["n_databases"] == 2
ext["versions"].sort()
assert ext["versions"] == ["1.2", "1.3"]

    # Check that the /metrics endpoint is available and
    # ensure that we see the metric before and after restart.
    res = client.metrics()
    info("Metrics: %s", res)
    m = parse_metrics(res)
    neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
    assert len(neon_m) == 1
    for sample in neon_m:
        assert sample.value == 2
    neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
    assert len(neon_m) == 1
    for sample in neon_m:
        assert sample.value == 1

    endpoint.stop()
    endpoint.start()

    timeout = 10
    while timeout > 0:
        try:
            res = client.metrics()
            if len(parse_metrics(res).query_all("installed_extensions")) < 4:
                # Assume that not all metrics are collected yet
                time.sleep(1)
                timeout -= 1
                continue
            break
        except Exception:
            log.exception("failed to get metrics, assume they are not collected yet")
            time.sleep(1)
            timeout -= 1
            continue

    assert (
        len(parse_metrics(res).query_all("installed_extensions")) >= 4
    ), "Not all metrics are collected"

    info("After restart metrics: %s", res)
    m = parse_metrics(res)
    neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
    assert len(neon_m) == 1
    for sample in neon_m:
        assert sample.value == 1

    neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
    assert len(neon_m) == 1
    for sample in neon_m:
        assert sample.value == 1