diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py
index d802817e..d7668c5f 100644
--- a/src/aleph/vm/orchestrator/resources.py
+++ b/src/aleph/vm/orchestrator/resources.py
@@ -12,7 +12,7 @@
 
 from aleph.vm.conf import settings
 from aleph.vm.pool import VmPool
-from aleph.vm.resources import GpuProperties
+from aleph.vm.resources import GpuDevice
 from aleph.vm.sevclient import SevClient
 from aleph.vm.utils import (
     check_amd_sev_es_supported,
@@ -74,8 +74,11 @@ class UsagePeriod(BaseModel):
 
 class MachineProperties(BaseModel):
     cpu: CpuProperties
-    gpu: Optional[List[GpuProperties]]
-    available_gpus: Optional[List[GpuProperties]]
+
+
+class GpuProperties(BaseModel):
+    devices: Optional[List[GpuDevice]]
+    available_devices: Optional[List[GpuDevice]]
 
 
 class MachineUsage(BaseModel):
@@ -84,20 +87,30 @@ class MachineUsage(BaseModel):
     disk: DiskUsage
     period: UsagePeriod
     properties: MachineProperties
+    gpu: GpuProperties
     active: bool = True
 
 
+def get_machine_gpus(request: web.Request) -> GpuProperties:
+    pool: VmPool = request.app["vm_pool"]
+    gpus = pool.gpus
+    available_gpus = pool.get_available_gpus()
+
+    return GpuProperties(
+        devices=gpus,
+        available_devices=available_gpus,
+    )
+
+
 @lru_cache
-def get_machine_properties(request: web.Request) -> MachineProperties:
+def get_machine_properties(_: web.Request) -> MachineProperties:
     """Fetch machine properties such as architecture, CPU vendor, ...
     These should not change while the supervisor is running.
 
     In the future, some properties may have to be fetched from within a VM.
     """
     cpu_info = cpuinfo.get_cpu_info()  # Slow
-    pool: VmPool = request.app["vm_pool"]
-    gpus = pool.gpus
-    available_gpus = pool.get_available_gpus()
+
     return MachineProperties(
         cpu=CpuProperties(
             architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")),
@@ -113,8 +126,6 @@ def get_machine_properties(request: web.Request) -> MachineProperties:
                 )
             ),
         ),
-        gpu=gpus,
-        available_gpus=available_gpus,
     )
 
 
@@ -122,6 +133,7 @@ def get_machine_properties(request: web.Request) -> MachineProperties:
 async def about_system_usage(request: web.Request):
     """Public endpoint to expose information about the system usage."""
     period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0)
+    machine_properties = get_machine_properties(request)
 
     usage: MachineUsage = MachineUsage(
         cpu=CpuUsage(
@@ -141,7 +153,8 @@ async def about_system_usage(request: web.Request):
             start_timestamp=period_start,
             duration_seconds=60,
         ),
-        properties=get_machine_properties(request),
+        properties=machine_properties,
+        gpu=get_machine_gpus(request)
     )
 
     return web.json_response(text=usage.json(exclude_none=True))
diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py
index 59d459da..3eb33daf 100644
--- a/src/aleph/vm/pool.py
+++ b/src/aleph/vm/pool.py
@@ -19,7 +19,7 @@
 from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager
 from aleph.vm.network.hostnetwork import Network, make_ipv6_allocator
 from aleph.vm.orchestrator.metrics import get_execution_records
-from aleph.vm.resources import GpuProperties, get_gpu_info
+from aleph.vm.resources import GpuDevice, get_gpu_devices
 from aleph.vm.systemd import SystemDManager
 from aleph.vm.utils import get_message_executable_content
 from aleph.vm.vm_type import VmType
@@ -43,7 +43,7 @@ class VmPool:
     snapshot_manager: SnapshotManager | None = None
     systemd_manager: SystemDManager
     creation_lock: asyncio.Lock
-    gpus: List[GpuProperties] = []
+    gpus: List[GpuDevice] = []
 
     def __init__(self, loop: asyncio.AbstractEventLoop):
         self.executions = {}
@@ -83,7 +83,7 @@ def setup(self) -> None:
 
         if settings.ENABLE_GPU_SUPPORT:
             logger.debug("Detecting GPU devices ...")
-            self.available_gpus = get_gpu_info()
+            self.available_gpus = get_gpu_devices()
 
     def teardown(self) -> None:
         """Stop the VM pool and the network properly."""
@@ -288,7 +288,8 @@ def get_instance_executions(self) -> Iterable[VmExecution]:
         )
         return executions or []
 
-    def get_available_gpus(self) -> Iterable[GpuProperties]:
+    def get_available_gpus(self) -> Iterable[GpuDevice]:
+        # TODO: Filter already used GPUs on current executions and remove it from available
         available_gpus = self.available_gpus
         return available_gpus or []
 
diff --git a/src/aleph/vm/resources.py b/src/aleph/vm/resources.py
index 7481ab7f..e2f3d8b2 100644
--- a/src/aleph/vm/resources.py
+++ b/src/aleph/vm/resources.py
@@ -2,7 +2,8 @@
 from enum import Enum
 from typing import List, Optional
 
-from pydantic import BaseModel, Extra, Field
+from aleph_message.models import HashableModel
+from pydantic import Extra, Field
 
 
 class GpuDeviceClass(str, Enum):
@@ -10,7 +11,7 @@ class GpuDeviceClass(str, Enum):
     _3D_CONTROLLER = "0302"
 
 
-class GpuProperties(BaseModel):
+class GpuDevice(HashableModel):
     """GPU properties."""
 
     vendor: str = Field(description="GPU vendor name")
@@ -56,7 +57,7 @@ def is_kernel_enabled_gpu(pci_host: str) -> bool:
     return False
 
 
-def parse_gpu_device_info(line: str) -> Optional[GpuProperties]:
+def parse_gpu_device_info(line: str) -> Optional[GpuDevice]:
     """Parse GPU device info from a line of lspci output."""
 
     pci_host, device = line.split(' "', maxsplit=1)
@@ -80,7 +81,7 @@ def parse_gpu_device_info(line: str) -> Optional[GpuProperties]:
         model_id = model_id[:-1]
     device_id = f"{vendor_id}:{model_id}"
 
-    return GpuProperties(
+    return GpuDevice(
         pci_host=pci_host,
         vendor=vendor_name,
         device_name=device_name,
@@ -89,7 +90,7 @@ def parse_gpu_device_info(line: str) -> Optional[GpuProperties]:
     )
 
 
-def get_gpu_info() -> Optional[List[GpuProperties]]:
+def get_gpu_devices() -> Optional[List[GpuDevice]]:
     """Get GPU info using lspci command."""
     result = subprocess.run(["lspci", "-mmnnn"], capture_output=True, text=True, check=True)
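
The TODO left in VmPool.get_available_gpus ("Filter already used GPUs on current executions") could be implemented along the lines of the sketch below. This is a minimal illustration under stated assumptions, not code from this patch: it presumes that a running execution exposes the GpuDevice objects assigned to it through a hypothetical gpus attribute, and it relies on GpuDevice now being a HashableModel so that devices can be collected in a set.

    from typing import Dict, Iterable, List

    from aleph.vm.resources import GpuDevice


    def filter_available_gpus(detected: List[GpuDevice], executions: Dict) -> Iterable[GpuDevice]:
        # GPUs held by running executions; `execution.gpus` is a hypothetical
        # attribute assumed for this sketch, it is not introduced by this diff.
        used = {gpu for execution in executions.values() for gpu in (getattr(execution, "gpus", None) or [])}
        # HashableModel makes GpuDevice hashable, so set membership tests work here.
        return [gpu for gpu in detected if gpu not in used]

With this change, the system usage response built by about_system_usage also gains a top-level gpu object with "devices" and "available_devices" lists (field names from the new GpuProperties model); the actual device entries depend on what lspci reports on the host.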