Skip to content

Commit

Permalink
Add GPU list to /about/usage/system
Browse files Browse the repository at this point in the history
  • Loading branch information
philogicae committed Nov 27, 2024
1 parent 6836d6d commit f37cc01
Showing 1 changed file with 55 additions and 1 deletion.
56 changes: 55 additions & 1 deletion src/aleph/vm/orchestrator/resources.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import logging
import math
import re
import subprocess
from datetime import datetime, timezone
from functools import lru_cache
from typing import List, Optional

import cpuinfo
import psutil
from aiohttp import web
from aleph_message.models import ItemHash
from aleph_message.models.abstract import HashableModel
from aleph_message.models.execution.environment import CpuProperties
from pydantic import BaseModel, Field
from pydantic import BaseModel, Extra, Field

from aleph.vm.conf import settings
from aleph.vm.sevclient import SevClient
Expand Down Expand Up @@ -69,8 +72,23 @@ class UsagePeriod(BaseModel):
duration_seconds: float


class GpuProperties(HashableModel):
    """GPU properties."""

    # PCI address of the device, i.e. the first token of an lspci line.
    domain_bus_device_func: str = Field(
        description="Domain, bus, device, and function numbers",
    )
    # Human-readable device class and its numeric code.
    class_name: str = Field(
        description="Class of the device",
    )
    class_code: str = Field(
        description="Class code of the device",
    )
    # Vendor/model text and the matching numeric identifiers.
    name: str = Field(
        description="Name of the device (vendor and model)",
    )
    vendor_and_device_id: str = Field(
        description="Vendor ID and device ID",
    )
    # Left as None when no revision was reported for the device.
    revision: Optional[str] = Field(
        default=None,
        description="Revision number of the device",
    )

    class Config:
        # Reject any attribute that is not declared on the model.
        extra = Extra.forbid


# Hardware description of the host: its CPU and, optionally, a list of GPUs.
class MachineProperties(BaseModel):
    cpu: CpuProperties
    # Explicit default: an `Optional[...]` annotation without one is only treated
    # as "not required" by pydantic v1, so spell out the intent to keep the field
    # optional regardless of the installed pydantic major version.
    gpu: Optional[List[GpuProperties]] = None


class MachineUsage(BaseModel):
Expand All @@ -82,6 +100,40 @@ class MachineUsage(BaseModel):
active: bool = True


# Command line used to enumerate PCI devices. The parser below relies on the
# bracketed numeric codes that the numeric flag adds next to the names.
# NOTE(review): `-nnn` appears to behave like the documented `-nn` - confirm.
LSPCI_COMMAND = [
    "lspci",
    "-nnn",
]
# PCI class codes reported as GPUs (per the PCI ID database):
# 0300 = VGA compatible controller, 0302 = 3D controller.
VALID_CLASS_CODES = {
    "0300",
    "0302",
}


# Layout of one line of `lspci -nn` output, for example:
#   01:00.0 VGA compatible controller [0300]: NVIDIA Corporation GP107 [GeForce GTX 1050 Ti] [10de:1c82] (rev a1)
# The device name may itself contain bracketed text, so the greedy `name` group
# is anchored on the LAST `[vvvv:dddd]` pair instead of on the first bracket.
_LSPCI_LINE_PATTERN = re.compile(
    r"(?P<slot>\S+)\s+"
    r"(?P<class_name>.+?)\s+\[(?P<class_code>[0-9a-fA-F]{4})\]:\s+"
    r"(?P<name>.+)\s+\[(?P<ids>[0-9a-fA-F]{4}:[0-9a-fA-F]{4})\]"
    r"(?:\s+\(rev\s+(?P<revision>[0-9a-fA-F]+)\))?"
)


def parse_device_info(line: str) -> GpuProperties:
    """Parse device info from a line of lspci output.

    The line is expected to look like
    ``<slot> <class name> [<class code>]: <device name> [<vendor>:<device>] (rev <xx>)``
    where the ``(rev ..)`` suffix is optional and anything after it is ignored.

    Args:
        line: One line of lspci output; surrounding whitespace is ignored.

    Returns:
        The parsed properties. ``revision`` is ``None`` when the line carries
        no ``(rev ..)`` suffix.

    Raises:
        ValueError: If the line does not follow the expected layout.
    """
    match = _LSPCI_LINE_PATTERN.match(line.strip())
    if match is None:
        msg = f"Unrecognized lspci output line: {line!r}"
        raise ValueError(msg)

    return GpuProperties(
        domain_bus_device_func=match["slot"],
        class_name=match["class_name"],
        class_code=match["class_code"],
        name=match["name"],
        vendor_and_device_id=match["ids"],
        # An optional group that did not take part in the match yields None.
        revision=match["revision"],
    )


def get_gpu_info() -> Optional[List[GpuProperties]]:
    """Get GPU info using lspci command.

    Runs ``LSPCI_COMMAND`` and keeps the devices whose class code is listed in
    ``VALID_CLASS_CODES``.

    GPU details are optional information, so failures degrade gracefully
    instead of propagating to the caller: if the command cannot be started or
    exits with an error, a warning is logged and ``None`` is returned; lines
    that cannot be parsed are skipped.

    Returns:
        The detected GPUs, or ``None`` when there are none or when the PCI
        devices could not be listed.
    """
    log = logging.getLogger(__name__)

    try:
        result = subprocess.run(LSPCI_COMMAND, capture_output=True, text=True, check=True)
    except (OSError, subprocess.CalledProcessError) as error:
        # OSError covers a missing or non-executable binary; CalledProcessError
        # covers a non-zero exit status (check=True).
        log.warning("Could not list PCI devices with %s: %s", LSPCI_COMMAND[0], error)
        return None

    gpu_devices = []
    for line in result.stdout.splitlines():
        if not line.strip():
            continue
        try:
            device = parse_device_info(line)
        except (ValueError, IndexError):
            # One odd line must not hide the GPUs described on the other lines.
            log.debug("Skipping unparseable lspci line: %r", line)
            continue
        if device.class_code in VALID_CLASS_CODES:
            gpu_devices.append(device)

    return gpu_devices or None


@lru_cache
def get_machine_properties() -> MachineProperties:
"""Fetch machine properties such as architecture, CPU vendor, ...
Expand All @@ -90,6 +142,7 @@ def get_machine_properties() -> MachineProperties:
In the future, some properties may have to be fetched from within a VM.
"""
cpu_info = cpuinfo.get_cpu_info() # Slow
gpu_info = get_gpu_info()
return MachineProperties(
cpu=CpuProperties(
architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")),
Expand All @@ -105,6 +158,7 @@ def get_machine_properties() -> MachineProperties:
)
),
),
gpu=gpu_info,
)


Expand Down

0 comments on commit f37cc01

Please sign in to comment.