Skip to content

Commit

Permalink
Add more Pressure Stall Indicator(PSI) metrics to Blazel trace profiler.
Browse files Browse the repository at this point in the history
There are 2 types of PSI metrics: full and some. The "some" line indicates the share of time in which at least some tasks are stalled on a given resource. The "full" line indicates the share of time in which all non-idle tasks are stalled on a given resource simultaneously.There is no full PSI metric for CPU resource.

In this change I've added CPU PSI metric. Also I've added stacked memory and IO metrics with split on some/full types.

PiperOrigin-RevId: 605569491
Change-Id: Ia37faa60e783d8faf8e2bd614cf2899b9704cb5f
  • Loading branch information
Googler authored and copybara-github committed Feb 9, 2024
1 parent 89ea03d commit 9f74a1d
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
package com.google.devtools.build.lib.profiler;

import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorMetric.FULL;
import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorMetric.SOME;
import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorResource.CPU;
import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorResource.IO;
import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorResource.MEMORY;

import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
Expand Down Expand Up @@ -60,6 +65,10 @@ public class CollectLocalResourceUsage implements LocalResourceCollector {
@Nullable
private Map<ProfilerTask, TimeSeries> timeSeries;

@GuardedBy("this")
@Nullable
private List<List<ProfilerTask>> stackedTaskGroups;

private Stopwatch stopwatch;

private final WorkerProcessMetricsCollector workerProcessMetricsCollector;
Expand Down Expand Up @@ -108,6 +117,7 @@ public void run() {
stopwatch = Stopwatch.createStarted();
synchronized (CollectLocalResourceUsage.this) {
timeSeries = new HashMap<>();
stackedTaskGroups = new ArrayList<>();
Duration startTime = stopwatch.elapsed();
List<ProfilerTask> enabledCounters = new ArrayList<>();
enabledCounters.addAll(
Expand All @@ -132,8 +142,20 @@ public void run() {
enabledCounters.add(ProfilerTask.CPU_USAGE_ESTIMATION);
}
if (collectPressureStallIndicators) {
enabledCounters.add(ProfilerTask.PRESSURE_STALL_IO);
enabledCounters.add(ProfilerTask.PRESSURE_STALL_MEMORY);
enabledCounters.add(ProfilerTask.PRESSURE_STALL_FULL_IO);
enabledCounters.add(ProfilerTask.PRESSURE_STALL_FULL_MEMORY);
enabledCounters.add(ProfilerTask.PRESSURE_STALL_SOME_IO);
enabledCounters.add(ProfilerTask.PRESSURE_STALL_SOME_MEMORY);
enabledCounters.add(ProfilerTask.PRESSURE_STALL_SOME_CPU);

// There is no PRESSURE_STALL_FULL_CPU metric, so it's not a stacked counter.
stackedTaskGroups.add(
ImmutableList.of(
ProfilerTask.PRESSURE_STALL_FULL_IO, ProfilerTask.PRESSURE_STALL_SOME_IO));
stackedTaskGroups.add(
ImmutableList.of(
ProfilerTask.PRESSURE_STALL_FULL_MEMORY,
ProfilerTask.PRESSURE_STALL_SOME_MEMORY));
}

for (ProfilerTask counter : enabledCounters) {
Expand Down Expand Up @@ -211,12 +233,21 @@ public void run() {
if (collectLoadAverage) {
loadAverage = osBean.getSystemLoadAverage();
}
double pressureStallIo = 0;
double pressureStallMemory = 0;

// The pressure stall indicator for full CPU metric is not defined.
double pressureStallFullIo = 0;
double pressureStallFullMemory = 0;
double pressureStallSomeIo = 0;
double pressureStallSomeMemory = 0;
double pressureStallSomeCpu = 0;
// The pressure stall indicators are only available on Linux.
if (collectPressureStallIndicators && OS.getCurrent() == OS.LINUX) {
pressureStallIo = ResourceUsage.readPressureStallIndicator("io");
pressureStallMemory = ResourceUsage.readPressureStallIndicator("memory");
pressureStallFullIo = ResourceUsage.readPressureStallIndicator(IO, FULL);
pressureStallFullMemory = ResourceUsage.readPressureStallIndicator(MEMORY, FULL);

pressureStallSomeIo = ResourceUsage.readPressureStallIndicator(IO, SOME);
pressureStallSomeMemory = ResourceUsage.readPressureStallIndicator(MEMORY, SOME);
pressureStallSomeCpu = ResourceUsage.readPressureStallIndicator(CPU, SOME);
}

double deltaNanos = nextElapsed.minus(previousElapsed).toNanos();
Expand Down Expand Up @@ -252,16 +283,31 @@ public void run() {
if (loadAverage > 0) {
addRange(ProfilerTask.SYSTEM_LOAD_AVERAGE, previousElapsed, nextElapsed, loadAverage);
}
if (pressureStallIo >= 0) {
addRange(ProfilerTask.PRESSURE_STALL_IO, previousElapsed, nextElapsed, pressureStallIo);
}
if (pressureStallMemory >= 0) {
addRange(
ProfilerTask.PRESSURE_STALL_MEMORY,
previousElapsed,
nextElapsed,
pressureStallMemory);
}
addRange(
ProfilerTask.PRESSURE_STALL_SOME_CPU,
previousElapsed,
nextElapsed,
pressureStallSomeCpu);
addRange(
ProfilerTask.PRESSURE_STALL_SOME_IO,
previousElapsed,
nextElapsed,
pressureStallSomeIo);
addRange(
ProfilerTask.PRESSURE_STALL_FULL_IO,
previousElapsed,
nextElapsed,
pressureStallFullIo);
addRange(
ProfilerTask.PRESSURE_STALL_SOME_MEMORY,
previousElapsed,
nextElapsed,
pressureStallSomeMemory);
addRange(
ProfilerTask.PRESSURE_STALL_FULL_MEMORY,
previousElapsed,
nextElapsed,
pressureStallFullMemory);
if (systemNetworkUsages != null) {
addRange(
ProfilerTask.SYSTEM_NETWORK_UP_USAGE,
Expand Down Expand Up @@ -318,12 +364,34 @@ synchronized void logCollectedData() {
Profiler profiler = Profiler.instance();

for (ProfilerTask task : timeSeries.keySet()) {
if (isStacked(task)) {
continue;
}
profiler.logCounters(
ImmutableMap.ofEntries(Map.entry(task, timeSeries.get(task).toDoubleArray(len))),
profileStart,
BUCKET_DURATION);
}

for (List<ProfilerTask> taskGroup : stackedTaskGroups) {
ImmutableMap.Builder<ProfilerTask, double[]> stackedCounters = ImmutableMap.builder();
for (ProfilerTask task : taskGroup) {
stackedCounters.put(task, timeSeries.get(task).toDoubleArray(len));
}
profiler.logCounters(stackedCounters.buildOrThrow(), profileStart, BUCKET_DURATION);
}

timeSeries = null;
stackedTaskGroups = null;
}

private synchronized boolean isStacked(ProfilerTask type) {
for (List<ProfilerTask> tasks : stackedTaskGroups) {
if (tasks.contains(type)) {
return true;
}
}
return false;
}

private void addRange(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,11 @@ final class CounterSeriesTraceData implements TraceData {
entry(ProfilerTask.SYSTEM_LOAD_AVERAGE, "generic_work"),
entry(ProfilerTask.MEMORY_USAGE_ESTIMATION, "rail_idle"),
entry(ProfilerTask.CPU_USAGE_ESTIMATION, "cq_build_attempt_passed"),
entry(ProfilerTask.PRESSURE_STALL_IO, "rail_idle"),
entry(ProfilerTask.PRESSURE_STALL_MEMORY, "rail_animation"));
entry(ProfilerTask.PRESSURE_STALL_FULL_IO, "rail_animation"),
entry(ProfilerTask.PRESSURE_STALL_SOME_IO, "cq_build_attempt_failed"),
entry(ProfilerTask.PRESSURE_STALL_FULL_MEMORY, "thread_state_unknown"),
entry(ProfilerTask.PRESSURE_STALL_SOME_MEMORY, "rail_idle"),
entry(ProfilerTask.PRESSURE_STALL_SOME_CPU, "thread_state_running"));

private static final ImmutableMap<ProfilerTask, String> COUNTER_TASK_TO_SERIES_NAME =
ImmutableMap.ofEntries(
Expand All @@ -106,8 +109,11 @@ final class CounterSeriesTraceData implements TraceData {
entry(ProfilerTask.SYSTEM_LOAD_AVERAGE, "load"),
entry(ProfilerTask.MEMORY_USAGE_ESTIMATION, "estimated memory"),
entry(ProfilerTask.CPU_USAGE_ESTIMATION, "estimated cpu"),
entry(ProfilerTask.PRESSURE_STALL_IO, "i/o pressure"),
entry(ProfilerTask.PRESSURE_STALL_MEMORY, "memory pressure"));
entry(ProfilerTask.PRESSURE_STALL_FULL_IO, "i/o pressure (full)"),
entry(ProfilerTask.PRESSURE_STALL_FULL_MEMORY, "memory pressure (full)"),
entry(ProfilerTask.PRESSURE_STALL_SOME_IO, "i/o pressure (some)"),
entry(ProfilerTask.PRESSURE_STALL_SOME_MEMORY, "memory pressure (some)"),
entry(ProfilerTask.PRESSURE_STALL_SOME_CPU, "cpu pressure (some)"));

@Override
public void writeTraceData(JsonWriter jsonWriter, long profileStartTimeNanos) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,11 @@ public enum ProfilerTask {
WORKER_WORKING("waiting for response from worker"),
WORKER_COPYING_OUTPUTS("copying outputs from worker"),
CREDENTIAL_HELPER("calling credential helper"),
PRESSURE_STALL_IO("I/O pressure stall level"),
PRESSURE_STALL_MEMORY("Memory pressure stall level"),
PRESSURE_STALL_FULL_IO("I/O pressure stall level"),
PRESSURE_STALL_FULL_MEMORY("Memory pressure stall level"),
PRESSURE_STALL_SOME_IO("I/O pressure stall level"),
PRESSURE_STALL_SOME_MEMORY("Memory pressure stall level"),
PRESSURE_STALL_SOME_CPU("CPU pressure stall level"),
CONFLICT_CHECK("Conflict checking"),
DYNAMIC_LOCK("Acquiring dynamic execution output lock", Threshold.FIFTY_MILLIS),
REPOSITORY_FETCH("Fetching repository"),
Expand Down
91 changes: 78 additions & 13 deletions src/main/java/com/google/devtools/build/lib/util/ResourceUsage.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,12 @@ public final class ResourceUsage {

private static final MemoryMXBean MEM_BEAN = ManagementFactory.getMemoryMXBean();
private static final Splitter WHITESPACE_SPLITTER = Splitter.on(CharMatcher.whitespace());
private static final Pattern PSI_AVG10_VALUE_PATTERN = Pattern.compile("^full avg10=([\\d.]+).*");
private static final Pattern PSI_AVG10_VALUE_PATTERN_FULL =
Pattern.compile("^full avg10=([\\d.]+).*");
private static final Pattern PSI_AVG10_VALUE_PATTERN_SOME =
Pattern.compile("^some avg10=([\\d.]+).*");
private static final String PSI_AVG10_START_FULL = "full avg10";
private static final String PSI_AVG10_START_SOME = "some avg10";

private ResourceUsage() {}

Expand Down Expand Up @@ -120,8 +125,10 @@ public static Measurement measureCurrentResourceUsage() {
MEM_BEAN.getNonHeapMemoryUsage().getUsed(),
MEM_BEAN.getNonHeapMemoryUsage().getCommitted(),
(float) OS_BEAN.getSystemLoadAverage(),
readPressureStallIndicator("memory"),
readPressureStallIndicator("io"),
readPressureStallIndicator(
PressureStallIndicatorResource.MEMORY, PressureStallIndicatorMetric.FULL),
readPressureStallIndicator(
PressureStallIndicatorResource.IO, PressureStallIndicatorMetric.FULL),
getAvailableMemory(),
getCurrentCpuUtilizationInMs());
}
Expand Down Expand Up @@ -155,21 +162,37 @@ private static long[] getCurrentCpuUtilizationInMs() {
* Reads the Pressure Staller Indicator file for a given type and returns the double value for
* `avg10`, or -1 if we couldn't read that value.
*/
public static float readPressureStallIndicator(String type) {
String fileName = "/proc/pressure/" + type;
public static float readPressureStallIndicator(
PressureStallIndicatorResource resource, PressureStallIndicatorMetric metric) {
String fileName = "/proc/pressure/" + resource.getResource();
File procFile = new File(fileName);
if (!procFile.canRead()) {
return -1.0F;
}
try {
List<String> lines = Files.readLines(procFile, Charset.defaultCharset());
for (String l : lines) {
if (l.startsWith("full avg10")) {
Matcher matcher = PSI_AVG10_VALUE_PATTERN.matcher(l);
if (!matcher.matches()) {
return -1.0F;
}
return Float.parseFloat(matcher.group(1));
for (String line : lines) {
switch (metric) {
case FULL:
// Tries to find a line in file with the `full` metrics
if (!line.startsWith(PSI_AVG10_START_FULL)) {
break;
}
Matcher fullMatcher = PSI_AVG10_VALUE_PATTERN_FULL.matcher(line);
if (!fullMatcher.matches()) {
return -1.0F;
}
return Float.parseFloat(fullMatcher.group(1));
case SOME:
// Tries to find a line in file with the `some` metrics
if (!line.startsWith(PSI_AVG10_START_SOME)) {
break;
}
Matcher someMatcher = PSI_AVG10_VALUE_PATTERN_SOME.matcher(line);
if (!someMatcher.matches()) {
return -1.0F;
}
return Float.parseFloat(someMatcher.group(1));
}
}
return -1.0F;
Expand All @@ -178,6 +201,48 @@ public static float readPressureStallIndicator(String type) {
}
}

/**
* Represents a type of resource which pressure stall indicator could be collected.
*
* <p>Indicators for only this 3 types of resources are available in Linux machines.
*/
public enum PressureStallIndicatorResource {
MEMORY("memory"),
IO("io"),
CPU("cpu");

private final String resource;

PressureStallIndicatorResource(String resource) {
this.resource = resource;
}

public String getResource() {
return resource;
}
}

/**
* Represents a type of metric for pressure stall indicators. The "some" metric indicates the
* share of time in which at least some tasks are stalled on a given resource. The "full" metric
* indicates the share of time in which all non-idle tasks are stalled on a given resource
* simultaneously. (CPU full is undefined at the system level, by default always zero)
*/
public enum PressureStallIndicatorMetric {
FULL("full"),
SOME("some");

private final String metric;

PressureStallIndicatorMetric(String metric) {
this.metric = metric;
}

public String getMetric() {
return metric;
}
}

public static long getAvailableMemory() {
long availableMemory;
try {
Expand Down Expand Up @@ -301,7 +366,7 @@ public float getIoPressureLast10Sec() {
return ioPressureLast10Sec;
}

/** Returns the free physical memmory in bytes at the time of measurement. */
/** Returns the free physical memory in bytes at the time of measurement. */
public long getFreePhysicalMemory() {
return freePhysicalMemory;
}
Expand Down

0 comments on commit 9f74a1d

Please sign in to comment.