From e0b9a05c8ece1a1c8424eab4c1a35f03b02c5a5c Mon Sep 17 00:00:00 2001
From: jakki-amd
Date: Fri, 20 Dec 2024 13:09:54 +0200
Subject: [PATCH 1/3] Add Apple system metrics support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Bipradip Chowdhury
Co-authored-by: Rony Leppänen
Co-authored-by: Anders Smedegaard Pedersen
---
 .../pytorch/serve/device/utils/AppleUtil.java     | 16 ++++++----------
 .../java/org/pytorch/serve/ModelServerTest.java   |  7 ++++---
 .../serve/device/utils/AppleUtilTest.java         |  6 +++---
 .../pytorch/serve/util/ConfigManagerTest.java     |  3 ++-
 ts/metrics/system_metrics.py                      | 14 ++++++++++++++
 5 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/frontend/server/src/main/java/org/pytorch/serve/device/utils/AppleUtil.java b/frontend/server/src/main/java/org/pytorch/serve/device/utils/AppleUtil.java
index 3c32be3317..837045bbfc 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/device/utils/AppleUtil.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/device/utils/AppleUtil.java
@@ -5,10 +5,9 @@
 import com.google.gson.JsonObject;
 import com.google.gson.JsonParser;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.LinkedHashSet;
 import java.util.List;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
 import org.pytorch.serve.device.Accelerator;
 import org.pytorch.serve.device.AcceleratorVendor;
 import org.pytorch.serve.device.interfaces.IAcceleratorUtility;
@@ -75,15 +74,12 @@ public List<JsonObject> extractAccelerators(JsonElement rootObject) {
                         .getAsJsonObject() // Gets the outer object
                         .get("SPDisplaysDataType") // Gets the "SPDisplaysDataType" element
                         .getAsJsonArray();
+
         JsonObject gpuObject = displaysArray.get(0).getAsJsonObject();
-        int number_of_cores = Integer.parseInt(gpuObject.get("sppci_cores").getAsString());
-
-        // add the object `number_of_cores` times to maintain the exsisitng
-        // functionality
-        accelerators =
-                IntStream.range(0, number_of_cores)
-                        .mapToObj(i -> gpuObject)
-                        .collect(Collectors.toList());
+
+        // Create list with only a single accelerator object as
+        // M1, M2, M3 Macs have only single integrated GPU
+        accelerators = Collections.singletonList(gpuObject);
 
         return accelerators;
     }
diff --git a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
index bd7f654ce7..cddef2a91b 100644
--- a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
+++ b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
@@ -1351,7 +1351,8 @@ public void testMetricManager() throws JsonParseException, InterruptedException
         Map<String, Map<String, String>> expectedMetrics = new HashMap<>();
         expectedMetrics.put("GPUMemoryUtilization", Map.of(UNIT, "Percent", LEVEL, HOST));
         expectedMetrics.put("GPUMemoryUsed", Map.of(UNIT, "Megabytes", LEVEL, HOST));
-        expectedMetrics.put("GPUUtilization", Map.of(UNIT, "Percent", LEVEL, HOST));
+        // torch.mps does not allow to calculate GPUUtilization, see ts/metrics/system_metrics.py
+        // expectedMetrics.put("GPUUtilization", Map.of(UNIT, "Percent", LEVEL, HOST));
         expectedMetrics.put("CPUUtilization", Map.of(UNIT, "Percent", LEVEL, HOST));
         expectedMetrics.put("MemoryUsed", Map.of(UNIT, "Megabytes", LEVEL, HOST));
         expectedMetrics.put("MemoryAvailable", Map.of(UNIT, "Megabytes", LEVEL, HOST));
@@ -1372,8 +1373,8 @@ public void testMetricManager() throws JsonParseException, InterruptedException
             Assert.assertTrue(++count < 5);
         }
 
-        // 7 system-level metrics + 3 gpu-specific metrics
-        Assert.assertEquals(metrics.size(), 7 + 3 * configManager.getNumberOfGpu());
+        // 7 system-level metrics + 2 gpu-specific metrics
+        Assert.assertEquals(metrics.size(), 7 + 2 * configManager.getNumberOfGpu());
 
         for (Metric metric : metrics) {
             String metricName = metric.getMetricName();
diff --git a/frontend/server/src/test/java/org/pytorch/serve/device/utils/AppleUtilTest.java b/frontend/server/src/test/java/org/pytorch/serve/device/utils/AppleUtilTest.java
index e333f7ec83..c52e105fc4 100644
--- a/frontend/server/src/test/java/org/pytorch/serve/device/utils/AppleUtilTest.java
+++ b/frontend/server/src/test/java/org/pytorch/serve/device/utils/AppleUtilTest.java
@@ -76,7 +76,7 @@ public void testExtractAcceleratorId() {
     public void testExtractAccelerators() {
         List<JsonObject> accelerators = appleUtil.extractAccelerators(sampleOutputJson);
 
-        assertEquals(accelerators.size(), 7);
+        assertEquals(accelerators.size(), 1);
         assertEquals(accelerators.get(0).get("sppci_model").getAsString(), "Apple M1");
     }
 
@@ -88,7 +88,7 @@ public void testSmiOutputToUpdatedAccelerators() {
         ArrayList<Accelerator> updatedAccelerators =
                 appleUtil.smiOutputToUpdatedAccelerators(sampleOutputJson.toString(), parsedGpuIds);
 
-        assertEquals(updatedAccelerators.size(), 7);
+        assertEquals(updatedAccelerators.size(), 1);
         Accelerator accelerator = updatedAccelerators.get(0);
         assertEquals(accelerator.getAcceleratorModel(), "Apple M1");
         assertEquals(accelerator.getVendor(), AcceleratorVendor.APPLE);
@@ -112,7 +112,7 @@ public String[] getUtilizationSmiCommand() {
         ArrayList<Accelerator> availableAccelerators =
                 spyAppleUtil.getAvailableAccelerators(availableAcceleratorIds);
 
-        assertEquals(availableAccelerators.size(), 7);
+        assertEquals(availableAccelerators.size(), 1);
         Accelerator accelerator = availableAccelerators.get(0);
         assertEquals(accelerator.getAcceleratorModel(), "Apple M1");
         assertEquals(accelerator.getVendor(), AcceleratorVendor.APPLE);
diff --git a/frontend/server/src/test/java/org/pytorch/serve/util/ConfigManagerTest.java b/frontend/server/src/test/java/org/pytorch/serve/util/ConfigManagerTest.java
index 4616b5ae03..4b0b70acc8 100644
--- a/frontend/server/src/test/java/org/pytorch/serve/util/ConfigManagerTest.java
+++ b/frontend/server/src/test/java/org/pytorch/serve/util/ConfigManagerTest.java
@@ -118,7 +118,8 @@ public void testNumGpuM1() throws ReflectiveOperationException, IOException {
         String mac_arm64_cpu_only = System.getenv().getOrDefault("TS_MAC_ARM64_CPU_ONLY", "False");
         if (arch.equals("aarch64")) {
             if (mac_arm64_cpu_only.equals("True")) {
-                Assert.assertEquals(configManager.getNumberOfGpu(), 0);
+                // Mac M1 returns 1 accelerator device
+                Assert.assertEquals(configManager.getNumberOfGpu(), 1);
             } else {
                 Assert.assertTrue(configManager.getNumberOfGpu() > 0);
             }
diff --git a/ts/metrics/system_metrics.py b/ts/metrics/system_metrics.py
index 5e69377f5a..6c2becfcda 100644
--- a/ts/metrics/system_metrics.py
+++ b/ts/metrics/system_metrics.py
@@ -88,6 +88,20 @@ def collect_gpu_metrics(num_of_gpus):
                 amdsmi.amdsmi_shut_down()
             except amdsmi.AmdSmiException as e:
                 logging.error("Could not shut down AMD-SMI library.")
+    elif torch.backends.mps.is_available():
+        try:
+            total_memory = torch.mps.driver_allocated_memory()
+            mem_used = torch.mps.current_allocated_memory()
+            gpu_mem_utilization = (
+                (mem_used / total_memory * 100) if total_memory > 0 else 0
+            )
+            # Currently there is no way to calculate GPU utilization with MPS.
+            gpu_utilization = None
+        except Exception as e:
+            logging.error(f"Could not capture MPS memory metrics: {e}")
+            mem_used = 0
+            gpu_mem_utilization = 0
+            gpu_utilization = None
 
     dimension_gpu = [
         Dimension("Level", "Host"),

From 1deabffb2602c113d30e72891c26e49698264aee Mon Sep 17 00:00:00 2001
From: jakki
Date: Fri, 20 Dec 2024 15:50:41 +0200
Subject: [PATCH 2/3] Fix ModelServerTest.testMetricManager for other HW vendors

---
 .../java/org/pytorch/serve/ModelServerTest.java | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
index cddef2a91b..fb429cf32b 100644
--- a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
+++ b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
@@ -1373,8 +1373,17 @@ public void testMetricManager() throws JsonParseException, InterruptedException
             Assert.assertTrue(++count < 5);
         }
 
-        // 7 system-level metrics + 2 gpu-specific metrics
-        Assert.assertEquals(metrics.size(), 7 + 2 * configManager.getNumberOfGpu());
+        // Determine whether the test is running on an Apple (macOS) device
+        String vendor = System.getProperty("os.name");
+        if (vendor != null) {
+            if (vendor.startsWith("Mac")) {
+                // 7 system-level metrics + 2 gpu-specific metrics (per GPU) for Apple devices
+                Assert.assertEquals(metrics.size(), 7 + 2 * configManager.getNumberOfGpu());
+            } else {
+                // 7 system-level metrics + 3 gpu-specific metrics (per GPU) for non-Apple devices
+                Assert.assertEquals(metrics.size(), 7 + 3 * configManager.getNumberOfGpu());
+            }
+        }
 
         for (Metric metric : metrics) {
             String metricName = metric.getMetricName();

From e40e6c4116fc05b3fd80da8a5034ab39b49c8635 Mon Sep 17 00:00:00 2001
From: jakki
Date: Fri, 20 Dec 2024 15:54:29 +0200
Subject: [PATCH 3/3] Add GPUUtilization as expected metric

---
 .../src/test/java/org/pytorch/serve/ModelServerTest.java | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
index fb429cf32b..865b232fae 100644
--- a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
+++ b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
@@ -1351,8 +1351,7 @@ public void testMetricManager() throws JsonParseException, InterruptedException
         Map<String, Map<String, String>> expectedMetrics = new HashMap<>();
         expectedMetrics.put("GPUMemoryUtilization", Map.of(UNIT, "Percent", LEVEL, HOST));
         expectedMetrics.put("GPUMemoryUsed", Map.of(UNIT, "Megabytes", LEVEL, HOST));
-        // torch.mps does not allow to calculate GPUUtilization, see ts/metrics/system_metrics.py
-        // expectedMetrics.put("GPUUtilization", Map.of(UNIT, "Percent", LEVEL, HOST));
+        expectedMetrics.put("GPUUtilization", Map.of(UNIT, "Percent", LEVEL, HOST));
         expectedMetrics.put("CPUUtilization", Map.of(UNIT, "Percent", LEVEL, HOST));
         expectedMetrics.put("MemoryUsed", Map.of(UNIT, "Megabytes", LEVEL, HOST));
         expectedMetrics.put("MemoryAvailable", Map.of(UNIT, "Megabytes", LEVEL, HOST));
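
Note on the torch.mps calls introduced in PATCH 1/3: the sketch below is a minimal,
standalone illustration of how the new branch in ts/metrics/system_metrics.py derives
its numbers. torch.backends.mps.is_available(), torch.mps.driver_allocated_memory()
and torch.mps.current_allocated_memory() are the actual PyTorch APIs the patch relies
on; the function name, the returned dict and the bytes-to-megabytes conversion are
assumptions of this example only, since the patch itself feeds the raw values into
TorchServe's Metric/Dimension machinery instead.

import logging

import torch


def collect_mps_memory_metrics():
    """Collect Apple GPU memory metrics via MPS; GPU utilization is unavailable."""
    if not torch.backends.mps.is_available():
        return None
    try:
        # Memory the Metal driver has allocated for this process, in bytes.
        total_memory = torch.mps.driver_allocated_memory()
        # Memory currently held by live MPS allocations, in bytes.
        mem_used = torch.mps.current_allocated_memory()
        gpu_mem_utilization = (mem_used / total_memory * 100) if total_memory > 0 else 0
    except Exception:
        logging.exception("Could not capture MPS memory metrics")
        mem_used, gpu_mem_utilization = 0, 0
    return {
        "GPUMemoryUsed": mem_used / (1024 * 1024),  # megabytes; conversion assumed in this sketch
        "GPUMemoryUtilization": gpu_mem_utilization,  # percent of driver-allocated memory
        "GPUUtilization": None,  # MPS exposes no device-utilization counter
    }


if __name__ == "__main__":
    print(collect_mps_memory_metrics())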