Skip to content

Commit

Permalink
feature(plugin-nvidia):collect nvml process metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
ARTHUR CHENU (ext) committed Jan 27, 2025
1 parent 00547eb commit 52114e5
Showing 1 changed file with 210 additions and 74 deletions.
284 changes: 210 additions & 74 deletions plugin-nvidia/src/nvml.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,104 @@
use std::fmt::Display;
use std::sync::Arc;
use anyhow::Context;
use nvml_wrapper::{enum_wrappers::device::TemperatureSensor, error::NvmlError, Device, Nvml};
use nvml_wrapper_sys::bindings::nvmlDevice_t;
use std::{fmt::Display, sync::Arc};

use alumet::measurement::Timestamp;
use alumet::metrics::MetricCreationError;
use alumet::resources::ResourceConsumer;
use alumet::units::PrefixedUnit;
use alumet::{
measurement::{MeasurementAccumulator, MeasurementPoint},
metrics::TypedMetricId,
measurement::{MeasurementAccumulator, MeasurementPoint, Timestamp},
metrics::{MetricCreationError, TypedMetricId},
pipeline::elements::error::PollError,
plugin::util::CounterDiff,
plugin::AlumetPluginStart,
resources::Resource,
units::Unit,
plugin::{util::CounterDiff, AlumetPluginStart},
resources::{Resource, ResourceConsumer},
units::{PrefixedUnit, Unit},
};
use anyhow::Context;
use nvml_wrapper::{error::NvmlError, Device, Nvml};
use nvml_wrapper_sys::bindings::nvmlDevice_t;

/// Contains the ids of the measured metrics.
///
/// Every id refers to a `u64`-valued metric registered by `Metrics::new`.
#[derive(Clone)]
pub struct Metrics {
/// Energy consumed by the GPU (including memory) since the previous measurement,
/// registered in millijoules.
total_energy_consumption: TypedMetricId<u64>,
/// Instantaneous power of the GPU at the time of the measurement, registered in milliwatts.
instant_power: TypedMetricId<u64>,
/// Instantaneous temperature of the GPU at the time of the measurement, in °C.
temperature_gpu: TypedMetricId<u64>,
/// GPU utilization rate, in percent (registered as a unitless metric).
major_utilization_gpu: TypedMetricId<u64>,
/// GPU memory utilization rate, in percent (registered as a unitless metric).
/// Also used for the per-process memory utilization.
major_utilization_memory: TypedMetricId<u64>,
/// Utilization of the decoder (registered as a unitless metric).
/// Also used for the per-process decoder utilization.
decoder_utilization: TypedMetricId<u64>,
/// Sampling period of the decoder utilization, in μs.
decoder_sampling_period_us: TypedMetricId<u64>,
/// Utilization of the encoder (registered as a unitless metric).
/// Also used for the per-process encoder utilization.
encoder_utilization: TypedMetricId<u64>,
/// Sampling period of the encoder utilization, in μs.
encoder_sampling_period_us: TypedMetricId<u64>,
/// Per-process utilization of the streaming multiprocessors (SM) of a GPU, in percent.
sm_utilization: TypedMetricId<u64>,
/// Number of compute processes running on the device.
running_compute_processes: TypedMetricId<u64>,
/// Number of graphic processes running on the device.
running_graphics_processes: TypedMetricId<u64>,
}

impl Metrics {
    /// Registers every NVML metric in the Alumet pipeline and returns their ids.
    ///
    /// Each metric is created with a `name`, a `unit` and a human-readable `description`.
    /// The metrics are registered in the order in which they are listed below.
    ///
    /// # Arguments
    ///
    /// * `alumet` - The [`AlumetPluginStart`] structure passed to plugins for the start-up phase.
    ///
    /// # Errors
    ///
    /// Returns a [`MetricCreationError`] as soon as the creation of one metric fails.
    pub fn new(alumet: &mut AlumetPluginStart) -> Result<Self, MetricCreationError> {
        Ok(Self {
            total_energy_consumption: alumet.create_metric(
                "nvml_energy_consumption",
                PrefixedUnit::milli(Unit::Joule),
                "energy consumption by the GPU (including memory) since the previous measurement",
            )?,
            instant_power: alumet.create_metric(
                "nvml_instant_power",
                PrefixedUnit::milli(Unit::Watt),
                "instantaneous power of the GPU at the time of the measurement",
            )?,
            temperature_gpu: alumet.create_metric(
                "nvml_temperature_gpu",
                Unit::DegreeCelsius,
                "instantaneous temperature of the GPU at the time of the measurement",
            )?,
            // The utilization metrics below hold percentages but are registered as unitless.
            major_utilization_gpu: alumet.create_metric(
                "nvml_gpu_utilization",
                Unit::Unity,
                "GPU utilization rate, in percent",
            )?,
            major_utilization_memory: alumet.create_metric(
                "nvml_memory_utilization",
                Unit::Unity,
                "GPU memory utilization rate, in percent",
            )?,
            decoder_utilization: alumet.create_metric(
                "nvml_decoder_utilization",
                Unit::Unity,
                "decoder utilization, in percent",
            )?,
            encoder_utilization: alumet.create_metric(
                "nvml_encoder_utilization",
                Unit::Unity,
                "encoder utilization, in percent",
            )?,
            decoder_sampling_period_us: alumet.create_metric(
                "nvml_decoder_sampling_period",
                PrefixedUnit::micro(Unit::Second),
                "sampling period of the decoder utilization",
            )?,
            encoder_sampling_period_us: alumet.create_metric(
                "nvml_encoder_sampling_period",
                PrefixedUnit::micro(Unit::Second),
                "sampling period of the encoder utilization",
            )?,
            sm_utilization: alumet.create_metric(
                "nvml_sm_utilization",
                Unit::Unity,
                "utilization of the Streaming Multiprocessors (SM) of the GPU by a process, in percent",
            )?,
            running_compute_processes: alumet.create_metric(
                "nvml_n_compute_processes",
                Unit::Unity,
                "number of compute processes running on the device",
            )?,
            running_graphics_processes: alumet.create_metric(
                "nvml_n_graphic_processes",
                Unit::Unity,
                "number of graphic processes running on the device",
            )?,
        })
    }
}

/// Detected NVML devices.
pub struct NvmlDevices {
Expand Down Expand Up @@ -97,18 +179,29 @@ impl alumet::pipeline::Source for NvmlSource {
}
}

// Get power consumption in milliWatts
if features.instant_power {
// the power in milliWatts
let milli_watts = device.power_usage()?;
measurements.push(MeasurementPoint::new(
timestamp,
self.metrics.instant_power,
self.resource.clone(),
consumer.clone(),
milli_watts as u64,
device.power_usage()? as u64,
))
}

// Get temperature of GPU in °C
if features.temperature_gpu {
measurements.push(MeasurementPoint::new(
timestamp,
self.metrics.temperature_gpu,
self.resource.clone(),
consumer.clone(),
device.temperature(TemperatureSensor::Gpu)? as u64,
));
}

// Get the current utilization rates of this device's major subsystems, in percent
if features.major_utilization {
let u = device.utilization_rates()?;
measurements.push(MeasurementPoint::new(
Expand All @@ -127,6 +220,7 @@ impl alumet::pipeline::Source for NvmlSource {
));
}

// Get the current utilization and sampling size in μs for the decoder
if features.decoder_utilization {
let u = device.decoder_utilization()?;
measurements.push(MeasurementPoint::new(
Expand All @@ -145,6 +239,7 @@ impl alumet::pipeline::Source for NvmlSource {
));
}

// Get the current utilization and sampling size in μs for the encoder
if features.encoder_utilization {
let u = device.encoder_utilization()?;
measurements.push(MeasurementPoint::new(
Expand Down Expand Up @@ -193,64 +288,99 @@ impl alumet::pipeline::Source for NvmlSource {
));
}

// TODO explore device.samples() to gather multiple metrics at once
Ok(())
}
}
/// Creating a measurement point to push for processes with a compute context running on GPU device.
///
/// # Return
///
/// Return an error when occur during creation of a new measurement point.
fn process_utilization_metrics(
device: &Device,
processes: &[nvml_wrapper::struct_wrappers::device::ProcessInfo],
measurements: &mut MeasurementAccumulator,
timestamp: Timestamp,
resource: Resource,
metrics: &Metrics,
) -> Result<(), PollError> {
// Gets information about processes with a compute context running on this Device
for process in processes {
let pid = process.pid;
let consumer = ResourceConsumer::Process { pid };

for sm in device.process_utilization_stats(pid as u64)? {
// SM 3D compute utilization : Refers to the percentage of time that the Streaming Multiprocessors (SMs) of a GPU
measurements.push(MeasurementPoint::new(
timestamp,
metrics.sm_utilization,
resource.clone(),
consumer.clone(),
sm.sm_util as u64,
));
// Frame buffer memory utilization
measurements.push(MeasurementPoint::new(
timestamp,
metrics.major_utilization_memory,
resource.clone(),
consumer.clone(),
sm.mem_util as u64,
));
// Encoder utilization
measurements.push(MeasurementPoint::new(
timestamp,
metrics.encoder_utilization,
resource.clone(),
consumer.clone(),
sm.enc_util as u64,
));
// Decoder utilization
measurements.push(MeasurementPoint::new(
timestamp,
metrics.decoder_utilization,
resource.clone(),
consumer.clone(),
sm.dec_util as u64,
));
}
}
Ok(())
}

/// Contains the ids of the measured metrics.
#[derive(Clone)]
pub struct Metrics {
/// Energy consumed by the GPU (including memory) since the previous measurement,
/// registered in millijoules.
total_energy_consumption: TypedMetricId<u64>,
/// Instantaneous power of the GPU at the time of the measurement, registered in milliwatts.
instant_power: TypedMetricId<u64>,
/// GPU utilization (registered as a unitless metric).
major_utilization_gpu: TypedMetricId<u64>,
/// GPU memory utilization (registered as a unitless metric).
major_utilization_memory: TypedMetricId<u64>,
/// Decoder utilization (registered as a unitless metric).
decoder_utilization: TypedMetricId<u64>,
/// Sampling period of the decoder utilization, registered in microseconds.
decoder_sampling_period_us: TypedMetricId<u64>,
/// Encoder utilization (registered as a unitless metric).
encoder_utilization: TypedMetricId<u64>,
/// Sampling period of the encoder utilization, registered in microseconds.
encoder_sampling_period_us: TypedMetricId<u64>,
/// Number of compute processes running on the device.
running_compute_processes: TypedMetricId<u64>,
/// Number of graphic processes running on the device.
running_graphics_processes: TypedMetricId<u64>,
}
// Gets utilization stats for relevant currently running computing processes
let sm_compute_processes = match features.running_compute_processes {
AvailableVersion::Latest => Some(device.running_compute_processes()?),
AvailableVersion::V2 => Some(device.running_compute_processes_v2()?),
AvailableVersion::None => None,
};

impl Metrics {
pub fn new(alumet: &mut AlumetPluginStart) -> Result<Self, MetricCreationError> {
Ok(Self {
total_energy_consumption: alumet.create_metric(
"nvml_energy_consumption",
PrefixedUnit::milli(Unit::Joule),
"energy consumption by the GPU (including memory) since the previous measurement",
)?,
instant_power: alumet.create_metric(
"nvml_instant_power",
PrefixedUnit::milli(Unit::Watt),
"instantaneous power of the GPU at the time of the measurement",
)?,
major_utilization_gpu: alumet.create_metric("nvml_gpu_utilization", Unit::Unity, "")?,
major_utilization_memory: alumet.create_metric("nvml_memory_utilization", Unit::Unity, "")?,
decoder_utilization: alumet.create_metric("nvml_decoder_utilization", Unit::Unity, "")?,
encoder_utilization: alumet.create_metric("nvml_encoder_utilization", Unit::Unity, "")?,
decoder_sampling_period_us: alumet.create_metric(
"nvml_decoder_sampling_period",
PrefixedUnit::micro(Unit::Second),
"",
)?,
encoder_sampling_period_us: alumet.create_metric(
"nvml_encoder_sampling_period",
PrefixedUnit::micro(Unit::Second),
"",
)?,
running_compute_processes: alumet.create_metric(
"nvml_n_compute_processes",
Unit::Unity,
"number of compute processes running on the device",
)?,
running_graphics_processes: alumet.create_metric(
"nvml_n_graphic_processes",
Unit::Unity,
"number of graphic processes running on the device",
)?,
})
if let Some(processes) = sm_compute_processes {
process_utilization_metrics(
&device,
&processes,
measurements,
timestamp,
self.resource.clone(),
&self.metrics,
)?;
}

// Gets utilization stats for relevant currently running graphical processes
let sm_graphic_processes = match features.running_graphics_processes {
AvailableVersion::Latest => Some(device.running_graphics_processes()?),
AvailableVersion::V2 => Some(device.running_graphics_processes_v2()?),
AvailableVersion::None => None,
};

if let Some(processes) = sm_graphic_processes {
process_utilization_metrics(
&device,
&processes,
measurements,
timestamp,
self.resource.clone(),
&self.metrics,
)?;
}

Ok(())
}
}

Expand All @@ -259,6 +389,7 @@ impl Metrics {
pub struct OptionalFeatures {
total_energy_consumption: bool,
instant_power: bool,
temperature_gpu: bool,
major_utilization: bool,
decoder_utilization: bool,
encoder_utilization: bool,
Expand All @@ -280,6 +411,7 @@ impl OptionalFeatures {
Ok(Self {
total_energy_consumption: is_supported(device.total_energy_consumption())?,
instant_power: is_supported(device.power_usage())?,
temperature_gpu: is_supported(device.temperature(TemperatureSensor::Gpu))?,
major_utilization: is_supported(device.utilization_rates())?,
decoder_utilization: is_supported(device.decoder_utilization())?,
encoder_utilization: is_supported(device.encoder_utilization())?,
Expand All @@ -298,6 +430,7 @@ impl OptionalFeatures {
|| self.major_utilization
|| self.decoder_utilization
|| self.encoder_utilization
|| self.temperature_gpu
|| self.running_compute_processes != AvailableVersion::None
|| self.running_graphics_processes != AvailableVersion::None
}
Expand All @@ -321,6 +454,9 @@ impl Display for OptionalFeatures {
if self.encoder_utilization {
available.push("encoder_utilization");
}
if self.temperature_gpu {
available.push("temperature_gpu");
}
match self.running_compute_processes {
AvailableVersion::Latest => available.push("running_compute_processes(latest)"),
AvailableVersion::V2 => available.push("running_compute_processes(v2)"),
Expand Down

0 comments on commit 52114e5

Please sign in to comment.