From 9f74a1da17433d6c0c4bc43422ed189579574b35 Mon Sep 17 00:00:00 2001 From: Googler Date: Fri, 9 Feb 2024 02:18:49 -0800 Subject: [PATCH] Add more Pressure Stall Indicator(PSI) metrics to Blazel trace profiler. There are 2 types of PSI metrics: full and some. The "some" line indicates the share of time in which at least some tasks are stalled on a given resource. The "full" line indicates the share of time in which all non-idle tasks are stalled on a given resource simultaneously.There is no full PSI metric for CPU resource. In this change I've added CPU PSI metric. Also I've added stacked memory and IO metrics with split on some/full types. PiperOrigin-RevId: 605569491 Change-Id: Ia37faa60e783d8faf8e2bd614cf2899b9704cb5f --- .../profiler/CollectLocalResourceUsage.java | 100 +++++++++++++++--- .../lib/profiler/CounterSeriesTraceData.java | 14 ++- .../build/lib/profiler/ProfilerTask.java | 7 +- .../build/lib/util/ResourceUsage.java | 91 +++++++++++++--- 4 files changed, 177 insertions(+), 35 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/profiler/CollectLocalResourceUsage.java b/src/main/java/com/google/devtools/build/lib/profiler/CollectLocalResourceUsage.java index fe35af1f69abb1..e684f2f97e12ce 100644 --- a/src/main/java/com/google/devtools/build/lib/profiler/CollectLocalResourceUsage.java +++ b/src/main/java/com/google/devtools/build/lib/profiler/CollectLocalResourceUsage.java @@ -14,6 +14,11 @@ package com.google.devtools.build.lib.profiler; import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorMetric.FULL; +import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorMetric.SOME; +import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorResource.CPU; +import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorResource.IO; +import static com.google.devtools.build.lib.util.ResourceUsage.PressureStallIndicatorResource.MEMORY; import com.google.common.base.Preconditions; import com.google.common.base.Stopwatch; @@ -60,6 +65,10 @@ public class CollectLocalResourceUsage implements LocalResourceCollector { @Nullable private Map timeSeries; + @GuardedBy("this") + @Nullable + private List> stackedTaskGroups; + private Stopwatch stopwatch; private final WorkerProcessMetricsCollector workerProcessMetricsCollector; @@ -108,6 +117,7 @@ public void run() { stopwatch = Stopwatch.createStarted(); synchronized (CollectLocalResourceUsage.this) { timeSeries = new HashMap<>(); + stackedTaskGroups = new ArrayList<>(); Duration startTime = stopwatch.elapsed(); List enabledCounters = new ArrayList<>(); enabledCounters.addAll( @@ -132,8 +142,20 @@ public void run() { enabledCounters.add(ProfilerTask.CPU_USAGE_ESTIMATION); } if (collectPressureStallIndicators) { - enabledCounters.add(ProfilerTask.PRESSURE_STALL_IO); - enabledCounters.add(ProfilerTask.PRESSURE_STALL_MEMORY); + enabledCounters.add(ProfilerTask.PRESSURE_STALL_FULL_IO); + enabledCounters.add(ProfilerTask.PRESSURE_STALL_FULL_MEMORY); + enabledCounters.add(ProfilerTask.PRESSURE_STALL_SOME_IO); + enabledCounters.add(ProfilerTask.PRESSURE_STALL_SOME_MEMORY); + enabledCounters.add(ProfilerTask.PRESSURE_STALL_SOME_CPU); + + // There is no PRESSURE_STALL_FULL_CPU metric, so it's not a stacked counter. + stackedTaskGroups.add( + ImmutableList.of( + ProfilerTask.PRESSURE_STALL_FULL_IO, ProfilerTask.PRESSURE_STALL_SOME_IO)); + stackedTaskGroups.add( + ImmutableList.of( + ProfilerTask.PRESSURE_STALL_FULL_MEMORY, + ProfilerTask.PRESSURE_STALL_SOME_MEMORY)); } for (ProfilerTask counter : enabledCounters) { @@ -211,12 +233,21 @@ public void run() { if (collectLoadAverage) { loadAverage = osBean.getSystemLoadAverage(); } - double pressureStallIo = 0; - double pressureStallMemory = 0; + + // The pressure stall indicator for full CPU metric is not defined. + double pressureStallFullIo = 0; + double pressureStallFullMemory = 0; + double pressureStallSomeIo = 0; + double pressureStallSomeMemory = 0; + double pressureStallSomeCpu = 0; // The pressure stall indicators are only available on Linux. if (collectPressureStallIndicators && OS.getCurrent() == OS.LINUX) { - pressureStallIo = ResourceUsage.readPressureStallIndicator("io"); - pressureStallMemory = ResourceUsage.readPressureStallIndicator("memory"); + pressureStallFullIo = ResourceUsage.readPressureStallIndicator(IO, FULL); + pressureStallFullMemory = ResourceUsage.readPressureStallIndicator(MEMORY, FULL); + + pressureStallSomeIo = ResourceUsage.readPressureStallIndicator(IO, SOME); + pressureStallSomeMemory = ResourceUsage.readPressureStallIndicator(MEMORY, SOME); + pressureStallSomeCpu = ResourceUsage.readPressureStallIndicator(CPU, SOME); } double deltaNanos = nextElapsed.minus(previousElapsed).toNanos(); @@ -252,16 +283,31 @@ public void run() { if (loadAverage > 0) { addRange(ProfilerTask.SYSTEM_LOAD_AVERAGE, previousElapsed, nextElapsed, loadAverage); } - if (pressureStallIo >= 0) { - addRange(ProfilerTask.PRESSURE_STALL_IO, previousElapsed, nextElapsed, pressureStallIo); - } - if (pressureStallMemory >= 0) { - addRange( - ProfilerTask.PRESSURE_STALL_MEMORY, - previousElapsed, - nextElapsed, - pressureStallMemory); - } + addRange( + ProfilerTask.PRESSURE_STALL_SOME_CPU, + previousElapsed, + nextElapsed, + pressureStallSomeCpu); + addRange( + ProfilerTask.PRESSURE_STALL_SOME_IO, + previousElapsed, + nextElapsed, + pressureStallSomeIo); + addRange( + ProfilerTask.PRESSURE_STALL_FULL_IO, + previousElapsed, + nextElapsed, + pressureStallFullIo); + addRange( + ProfilerTask.PRESSURE_STALL_SOME_MEMORY, + previousElapsed, + nextElapsed, + pressureStallSomeMemory); + addRange( + ProfilerTask.PRESSURE_STALL_FULL_MEMORY, + previousElapsed, + nextElapsed, + pressureStallFullMemory); if (systemNetworkUsages != null) { addRange( ProfilerTask.SYSTEM_NETWORK_UP_USAGE, @@ -318,12 +364,34 @@ synchronized void logCollectedData() { Profiler profiler = Profiler.instance(); for (ProfilerTask task : timeSeries.keySet()) { + if (isStacked(task)) { + continue; + } profiler.logCounters( ImmutableMap.ofEntries(Map.entry(task, timeSeries.get(task).toDoubleArray(len))), profileStart, BUCKET_DURATION); } + + for (List taskGroup : stackedTaskGroups) { + ImmutableMap.Builder stackedCounters = ImmutableMap.builder(); + for (ProfilerTask task : taskGroup) { + stackedCounters.put(task, timeSeries.get(task).toDoubleArray(len)); + } + profiler.logCounters(stackedCounters.buildOrThrow(), profileStart, BUCKET_DURATION); + } + timeSeries = null; + stackedTaskGroups = null; + } + + private synchronized boolean isStacked(ProfilerTask type) { + for (List tasks : stackedTaskGroups) { + if (tasks.contains(type)) { + return true; + } + } + return false; } private void addRange( diff --git a/src/main/java/com/google/devtools/build/lib/profiler/CounterSeriesTraceData.java b/src/main/java/com/google/devtools/build/lib/profiler/CounterSeriesTraceData.java index fce0585f4c4ee9..cb0ad8bef77ec3 100644 --- a/src/main/java/com/google/devtools/build/lib/profiler/CounterSeriesTraceData.java +++ b/src/main/java/com/google/devtools/build/lib/profiler/CounterSeriesTraceData.java @@ -89,8 +89,11 @@ final class CounterSeriesTraceData implements TraceData { entry(ProfilerTask.SYSTEM_LOAD_AVERAGE, "generic_work"), entry(ProfilerTask.MEMORY_USAGE_ESTIMATION, "rail_idle"), entry(ProfilerTask.CPU_USAGE_ESTIMATION, "cq_build_attempt_passed"), - entry(ProfilerTask.PRESSURE_STALL_IO, "rail_idle"), - entry(ProfilerTask.PRESSURE_STALL_MEMORY, "rail_animation")); + entry(ProfilerTask.PRESSURE_STALL_FULL_IO, "rail_animation"), + entry(ProfilerTask.PRESSURE_STALL_SOME_IO, "cq_build_attempt_failed"), + entry(ProfilerTask.PRESSURE_STALL_FULL_MEMORY, "thread_state_unknown"), + entry(ProfilerTask.PRESSURE_STALL_SOME_MEMORY, "rail_idle"), + entry(ProfilerTask.PRESSURE_STALL_SOME_CPU, "thread_state_running")); private static final ImmutableMap COUNTER_TASK_TO_SERIES_NAME = ImmutableMap.ofEntries( @@ -106,8 +109,11 @@ final class CounterSeriesTraceData implements TraceData { entry(ProfilerTask.SYSTEM_LOAD_AVERAGE, "load"), entry(ProfilerTask.MEMORY_USAGE_ESTIMATION, "estimated memory"), entry(ProfilerTask.CPU_USAGE_ESTIMATION, "estimated cpu"), - entry(ProfilerTask.PRESSURE_STALL_IO, "i/o pressure"), - entry(ProfilerTask.PRESSURE_STALL_MEMORY, "memory pressure")); + entry(ProfilerTask.PRESSURE_STALL_FULL_IO, "i/o pressure (full)"), + entry(ProfilerTask.PRESSURE_STALL_FULL_MEMORY, "memory pressure (full)"), + entry(ProfilerTask.PRESSURE_STALL_SOME_IO, "i/o pressure (some)"), + entry(ProfilerTask.PRESSURE_STALL_SOME_MEMORY, "memory pressure (some)"), + entry(ProfilerTask.PRESSURE_STALL_SOME_CPU, "cpu pressure (some)")); @Override public void writeTraceData(JsonWriter jsonWriter, long profileStartTimeNanos) throws IOException { diff --git a/src/main/java/com/google/devtools/build/lib/profiler/ProfilerTask.java b/src/main/java/com/google/devtools/build/lib/profiler/ProfilerTask.java index 2024b49cfe2ce9..99ba600745ea04 100644 --- a/src/main/java/com/google/devtools/build/lib/profiler/ProfilerTask.java +++ b/src/main/java/com/google/devtools/build/lib/profiler/ProfilerTask.java @@ -94,8 +94,11 @@ public enum ProfilerTask { WORKER_WORKING("waiting for response from worker"), WORKER_COPYING_OUTPUTS("copying outputs from worker"), CREDENTIAL_HELPER("calling credential helper"), - PRESSURE_STALL_IO("I/O pressure stall level"), - PRESSURE_STALL_MEMORY("Memory pressure stall level"), + PRESSURE_STALL_FULL_IO("I/O pressure stall level"), + PRESSURE_STALL_FULL_MEMORY("Memory pressure stall level"), + PRESSURE_STALL_SOME_IO("I/O pressure stall level"), + PRESSURE_STALL_SOME_MEMORY("Memory pressure stall level"), + PRESSURE_STALL_SOME_CPU("CPU pressure stall level"), CONFLICT_CHECK("Conflict checking"), DYNAMIC_LOCK("Acquiring dynamic execution output lock", Threshold.FIFTY_MILLIS), REPOSITORY_FETCH("Fetching repository"), diff --git a/src/main/java/com/google/devtools/build/lib/util/ResourceUsage.java b/src/main/java/com/google/devtools/build/lib/util/ResourceUsage.java index e775b4c89a5a0d..3f8b14309e5a99 100644 --- a/src/main/java/com/google/devtools/build/lib/util/ResourceUsage.java +++ b/src/main/java/com/google/devtools/build/lib/util/ResourceUsage.java @@ -47,7 +47,12 @@ public final class ResourceUsage { private static final MemoryMXBean MEM_BEAN = ManagementFactory.getMemoryMXBean(); private static final Splitter WHITESPACE_SPLITTER = Splitter.on(CharMatcher.whitespace()); - private static final Pattern PSI_AVG10_VALUE_PATTERN = Pattern.compile("^full avg10=([\\d.]+).*"); + private static final Pattern PSI_AVG10_VALUE_PATTERN_FULL = + Pattern.compile("^full avg10=([\\d.]+).*"); + private static final Pattern PSI_AVG10_VALUE_PATTERN_SOME = + Pattern.compile("^some avg10=([\\d.]+).*"); + private static final String PSI_AVG10_START_FULL = "full avg10"; + private static final String PSI_AVG10_START_SOME = "some avg10"; private ResourceUsage() {} @@ -120,8 +125,10 @@ public static Measurement measureCurrentResourceUsage() { MEM_BEAN.getNonHeapMemoryUsage().getUsed(), MEM_BEAN.getNonHeapMemoryUsage().getCommitted(), (float) OS_BEAN.getSystemLoadAverage(), - readPressureStallIndicator("memory"), - readPressureStallIndicator("io"), + readPressureStallIndicator( + PressureStallIndicatorResource.MEMORY, PressureStallIndicatorMetric.FULL), + readPressureStallIndicator( + PressureStallIndicatorResource.IO, PressureStallIndicatorMetric.FULL), getAvailableMemory(), getCurrentCpuUtilizationInMs()); } @@ -155,21 +162,37 @@ private static long[] getCurrentCpuUtilizationInMs() { * Reads the Pressure Staller Indicator file for a given type and returns the double value for * `avg10`, or -1 if we couldn't read that value. */ - public static float readPressureStallIndicator(String type) { - String fileName = "/proc/pressure/" + type; + public static float readPressureStallIndicator( + PressureStallIndicatorResource resource, PressureStallIndicatorMetric metric) { + String fileName = "/proc/pressure/" + resource.getResource(); File procFile = new File(fileName); if (!procFile.canRead()) { return -1.0F; } try { List lines = Files.readLines(procFile, Charset.defaultCharset()); - for (String l : lines) { - if (l.startsWith("full avg10")) { - Matcher matcher = PSI_AVG10_VALUE_PATTERN.matcher(l); - if (!matcher.matches()) { - return -1.0F; - } - return Float.parseFloat(matcher.group(1)); + for (String line : lines) { + switch (metric) { + case FULL: + // Tries to find a line in file with the `full` metrics + if (!line.startsWith(PSI_AVG10_START_FULL)) { + break; + } + Matcher fullMatcher = PSI_AVG10_VALUE_PATTERN_FULL.matcher(line); + if (!fullMatcher.matches()) { + return -1.0F; + } + return Float.parseFloat(fullMatcher.group(1)); + case SOME: + // Tries to find a line in file with the `some` metrics + if (!line.startsWith(PSI_AVG10_START_SOME)) { + break; + } + Matcher someMatcher = PSI_AVG10_VALUE_PATTERN_SOME.matcher(line); + if (!someMatcher.matches()) { + return -1.0F; + } + return Float.parseFloat(someMatcher.group(1)); } } return -1.0F; @@ -178,6 +201,48 @@ public static float readPressureStallIndicator(String type) { } } + /** + * Represents a type of resource which pressure stall indicator could be collected. + * + *

Indicators for only this 3 types of resources are available in Linux machines. + */ + public enum PressureStallIndicatorResource { + MEMORY("memory"), + IO("io"), + CPU("cpu"); + + private final String resource; + + PressureStallIndicatorResource(String resource) { + this.resource = resource; + } + + public String getResource() { + return resource; + } + } + + /** + * Represents a type of metric for pressure stall indicators. The "some" metric indicates the + * share of time in which at least some tasks are stalled on a given resource. The "full" metric + * indicates the share of time in which all non-idle tasks are stalled on a given resource + * simultaneously. (CPU full is undefined at the system level, by default always zero) + */ + public enum PressureStallIndicatorMetric { + FULL("full"), + SOME("some"); + + private final String metric; + + PressureStallIndicatorMetric(String metric) { + this.metric = metric; + } + + public String getMetric() { + return metric; + } + } + public static long getAvailableMemory() { long availableMemory; try { @@ -301,7 +366,7 @@ public float getIoPressureLast10Sec() { return ioPressureLast10Sec; } - /** Returns the free physical memmory in bytes at the time of measurement. */ + /** Returns the free physical memory in bytes at the time of measurement. */ public long getFreePhysicalMemory() { return freePhysicalMemory; }