Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GPU list to /about/usage/system endpoint #726

Merged
merged 8 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/aleph/vm/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,11 @@

CONFIDENTIAL_SESSION_DIRECTORY: Path = Field(None, description="Default to EXECUTION_ROOT/sessions")

ENABLE_GPU_SUPPORT: bool = Field(
default=False,
description="Enable GPU pass-through support to VMs, only allowed for QEmu hypervisor",
)

# Tests on programs

FAKE_DATA_PROGRAM: Path | None = None
Expand Down Expand Up @@ -391,6 +396,8 @@
# assert check_amd_sev_snp_supported(), "SEV-SNP feature isn't enabled, enable it in BIOS"
assert self.ENABLE_QEMU_SUPPORT, "Qemu Support is needed for confidential computing and it's disabled, "
"enable it setting the env variable `ENABLE_QEMU_SUPPORT=True` in configuration"
if self.ENABLE_GPU_SUPPORT:
assert self.ENABLE_QEMU_SUPPORT, "Qemu Support is needed for GPU support and it's disabled, "

Check warning on line 400 in src/aleph/vm/conf.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/conf.py#L400

Added line #L400 was not covered by tests

def setup(self):
"""Setup the environment defined by the settings. Call this method after loading the settings."""
Expand Down
6 changes: 6 additions & 0 deletions src/aleph/vm/controllers/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ class QemuVMHostVolume(BaseModel):
read_only: bool


class QemuGPU(BaseModel):
pci_host: str


class QemuVMConfiguration(BaseModel):
qemu_bin_path: str
cloud_init_drive_path: str | None
Expand All @@ -33,6 +37,7 @@ class QemuVMConfiguration(BaseModel):
mem_size_mb: int
interface_name: str | None
host_volumes: list[QemuVMHostVolume]
gpus: list[QemuGPU]


class QemuConfidentialVMConfiguration(BaseModel):
Expand All @@ -45,6 +50,7 @@ class QemuConfidentialVMConfiguration(BaseModel):
mem_size_mb: int
interface_name: str | None
host_volumes: list[QemuVMHostVolume]
gpus: list[QemuGPU]
ovmf_path: Path
sev_session_file: Path
sev_dh_cert_file: Path
Expand Down
7 changes: 6 additions & 1 deletion src/aleph/vm/controllers/qemu/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from asyncio import Task
from asyncio.subprocess import Process
from pathlib import Path
from typing import Generic, TypeVar
from typing import Generic, List, TypeVar

import psutil
from aleph_message.models import ItemHash
Expand All @@ -17,6 +17,7 @@
from aleph.vm.controllers.configuration import (
Configuration,
HypervisorType,
QemuGPU,
QemuVMConfiguration,
QemuVMHostVolume,
save_controller_configuration,
Expand All @@ -29,13 +30,16 @@
from aleph.vm.controllers.qemu.cloudinit import CloudInitMixin
from aleph.vm.network.firewall import teardown_nftables_for_vm
from aleph.vm.network.interfaces import TapInterface
from aleph.vm.resources import HostGPU
from aleph.vm.storage import get_rootfs_base_path
from aleph.vm.utils import HostNotFoundError, ping, run_in_subprocess

logger = logging.getLogger(__name__)


class AlephQemuResources(AlephFirecrackerResources):
gpus: List[HostGPU] = []

async def download_runtime(self) -> None:
volume = self.message_content.rootfs
parent_image_path = await get_rootfs_base_path(volume.parent.ref)
Expand Down Expand Up @@ -200,6 +204,7 @@ async def configure(self):
)
for volume in self.resources.volumes
],
gpus=[QemuGPU(pci_host=gpu.pci_host) for gpu in self.resources.gpus],
)

configuration = Configuration(
Expand Down
2 changes: 2 additions & 0 deletions src/aleph/vm/controllers/qemu_confidential/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Configuration,
HypervisorType,
QemuConfidentialVMConfiguration,
QemuGPU,
QemuVMHostVolume,
save_controller_configuration,
)
Expand Down Expand Up @@ -126,6 +127,7 @@ async def configure(self):
)
for volume in self.resources.volumes
],
gpus=[QemuGPU(pci_host=gpu.pci_host) for gpu in self.resources.gpus],
)

configuration = Configuration(
Expand Down
34 changes: 28 additions & 6 deletions src/aleph/vm/hypervisors/qemu/qemuvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import qmp
from systemd import journal

from aleph.vm.controllers.configuration import QemuVMConfiguration
from aleph.vm.controllers.configuration import QemuGPU, QemuVMConfiguration
from aleph.vm.controllers.qemu.instance import logger


Expand All @@ -28,6 +28,7 @@
interface_name: str
qemu_process: Process | None = None
host_volumes: list[HostVolume]
gpus: list[QemuGPU]
journal_stdout: TextIO | None
journal_stderr: TextIO | None

Expand Down Expand Up @@ -55,6 +56,7 @@
)
for volume in config.host_volumes
]
self.gpus = config.gpus

@property
def _journal_stdout_name(self) -> str:
Expand Down Expand Up @@ -113,17 +115,15 @@
# "-serial", "telnet:localhost:4321,server,nowait",
# "-snapshot", # Do not save anything to disk
]
for volume in self.host_volumes:
args += [
"-drive",
f"file={volume.path_on_host},format=raw,readonly={'on' if volume.read_only else 'off'},media=disk,if=virtio",
]
if self.interface_name:
# script=no, downscript=no tell qemu not to try to set up the network itself
args += ["-net", "nic,model=virtio", "-net", f"tap,ifname={self.interface_name},script=no,downscript=no"]

if self.cloud_init_drive_path:
args += ["-cdrom", f"{self.cloud_init_drive_path}"]

args += self._get_host_volumes_args()
args += self._get_gpu_args()
print(*args)

self.qemu_process = proc = await asyncio.create_subprocess_exec(
Expand All @@ -138,6 +138,28 @@
)
return proc

def _get_host_volumes_args(self):
args = []
for volume in self.host_volumes:
args += [
"-drive",
f"file={volume.path_on_host},format=raw,readonly={'on' if volume.read_only else 'off'},media=disk,if=virtio",
]
return args

def _get_gpu_args(self):
args = [
# Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size
"-cpu",
"host,host-phys-bits-limit=0x28",
]
for gpu in self.gpus:
args += [

Check warning on line 157 in src/aleph/vm/hypervisors/qemu/qemuvm.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/hypervisors/qemu/qemuvm.py#L157

Added line #L157 was not covered by tests
"-device",
f"vfio-pci,host={gpu.pci_host},multifunction=on,x-vga=on",
]
return args

def _get_qmpclient(self) -> qmp.QEMUMonitorProtocol | None:
if not (self.qmp_socket_path and self.qmp_socket_path.exists()):
return None
Expand Down
12 changes: 6 additions & 6 deletions src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,24 +110,24 @@
# raise an error and prevent boot. Passing the argument --cpu host instruct the VM to use the same CPU
# model than the host thus the VM's kernel knows which method is used to get random numbers (Intel and
# AMD have different methods) and properly boot.
# Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size
"-cpu",
"host",
"host,host-phys-bits-limit=0x28",
# Uncomment following for debug
# "-serial", "telnet:localhost:4321,server,nowait",
# "-snapshot", # Do not save anything to disk
]
for volume in self.host_volumes:
args += [
"-drive",
f"file={volume.path_on_host},format=raw,readonly={'on' if volume.read_only else 'off'},media=disk,if=virtio",
]
if self.interface_name:
# script=no, downscript=no tell qemu not to try to set up the network itself
args += ["-net", "nic,model=virtio", "-net", f"tap,ifname={self.interface_name},script=no,downscript=no"]

if self.cloud_init_drive_path:
args += ["-cdrom", f"{self.cloud_init_drive_path}"]

args += self._get_host_volumes_args()
args += self._get_gpu_args()

Check warning on line 128 in src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py#L127-L128

Added lines #L127 - L128 were not covered by tests
print(*args)

self.qemu_process = proc = await asyncio.create_subprocess_exec(
*args,
stdin=asyncio.subprocess.DEVNULL,
Expand Down
34 changes: 30 additions & 4 deletions src/aleph/vm/models.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
import asyncio
import json
import logging
import uuid
from asyncio import Task
from collections.abc import Callable, Coroutine
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List

from aleph_message.models import (
ExecutableContent,
InstanceContent,
ItemHash,
ProgramContent,
)
from aleph_message.models.execution.environment import HypervisorType
from aleph_message.models.execution.environment import GpuProperties, HypervisorType
from pydantic.json import pydantic_encoder

from aleph.vm.conf import settings
from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable
from aleph.vm.controllers.firecracker.instance import AlephInstanceResources
from aleph.vm.controllers.firecracker.program import (
AlephFirecrackerProgram,
AlephFirecrackerResources,
AlephProgramResources,
)
from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager
Expand All @@ -38,6 +40,7 @@
)
from aleph.vm.orchestrator.pubsub import PubSub
from aleph.vm.orchestrator.vm import AlephFirecrackerInstance
from aleph.vm.resources import GpuDevice, HostGPU
from aleph.vm.systemd import SystemDManager
from aleph.vm.utils import create_task_log_exceptions, dumps_for_json

Expand Down Expand Up @@ -69,8 +72,11 @@
vm_hash: ItemHash
original: ExecutableContent
message: ExecutableContent
resources: AlephFirecrackerResources | None = None
vm: AlephFirecrackerExecutable | AlephQemuInstance | None = None
resources: (
AlephProgramResources | AlephInstanceResources | AlephQemuResources | AlephQemuConfidentialInstance | None
) = None
vm: AlephFirecrackerExecutable | AlephQemuInstance | AlephQemuConfidentialInstance | None = None
gpus: List[HostGPU] = []

times: VmExecutionTimes

Expand Down Expand Up @@ -202,6 +208,7 @@
resources = AlephQemuConfidentialResources(self.message, namespace=self.vm_hash)
else:
resources = AlephQemuResources(self.message, namespace=self.vm_hash)
resources.gpus = self.gpus
else:
msg = f"Unknown hypervisor type {self.hypervisor}"
raise ValueError(msg)
Expand All @@ -216,6 +223,24 @@
self.times.prepared_at = datetime.now(tz=timezone.utc)
self.resources = resources

def prepare_gpus(self, available_gpus: List[GpuDevice]) -> None:
gpus = []

Check warning on line 227 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L227

Added line #L227 was not covered by tests
if self.message.requirements and self.message.requirements.gpu:
for gpu in self.message.requirements.gpu:
gpu = GpuProperties.parse_obj(gpu)

Check warning on line 230 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L230

Added line #L230 was not covered by tests
for available_gpu in available_gpus:
if available_gpu.device_id == gpu.device_id:
gpus.append(HostGPU(pci_host=available_gpu.pci_host))
break
self.gpus = gpus

Check warning on line 235 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L233-L235

Added lines #L233 - L235 were not covered by tests

def uses_gpu(self, pci_host: str) -> bool:
for gpu in self.gpus:
if gpu.pci_host == pci_host:
return True

Check warning on line 240 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L240

Added line #L240 was not covered by tests

return False

Check warning on line 242 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L242

Added line #L242 was not covered by tests

def create(
self, vm_id: int, tap_interface: TapInterface | None = None, prepare: bool = True
) -> AlephVmControllerInterface:
Expand Down Expand Up @@ -437,6 +462,7 @@
message=self.message.json(),
original_message=self.original.json(),
persistent=self.persistent,
gpus=json.dumps(self.gpus, default=pydantic_encoder),
)
)

Expand Down
6 changes: 5 additions & 1 deletion src/aleph/vm/orchestrator/chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,13 @@
}


class InvalidChainError(ValueError):
pass


def get_chain(chain: str) -> ChainInfo:
try:
return STREAM_CHAINS[chain]
except KeyError:
msg = f"Unknown chain id for chain {chain}"
raise ValueError(msg)
raise InvalidChainError(msg)

Check warning on line 72 in src/aleph/vm/orchestrator/chain.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/chain.py#L72

Added line #L72 was not covered by tests
7 changes: 4 additions & 3 deletions src/aleph/vm/orchestrator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,6 @@
"""Measure program performance by immediately running the supervisor
with fake requests.
"""
engine = metrics.setup_engine()
await metrics.create_tables(engine)

ref = ItemHash("cafecafecafecafecafecafecafecafecafecafecafecafecafecafecafecafe")
settings.FAKE_DATA_PROGRAM = settings.BENCHMARK_FAKE_DATA_PROGRAM

Expand Down Expand Up @@ -357,6 +354,10 @@
settings.check()

logger.debug("Initialising the DB...")
# Check and create execution database
engine = metrics.setup_engine()
asyncio.run(metrics.create_tables(engine))

Check warning on line 359 in src/aleph/vm/orchestrator/cli.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/cli.py#L358-L359

Added lines #L358 - L359 were not covered by tests
# After creating it run the DB migrations
asyncio.run(run_async_db_migrations())
logger.debug("DB up to date.")

Expand Down
2 changes: 2 additions & 0 deletions src/aleph/vm/orchestrator/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class ExecutionRecord(Base):
original_message = Column(JSON, nullable=True)
persistent = Column(Boolean, nullable=True)

gpus = Column(JSON, nullable=True)

def __repr__(self):
return f"<ExecutionRecord(uuid={self.uuid}, vm_hash={self.vm_hash}, vm_id={self.vm_id})>"

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""add gpu table

Revision ID: 5c6ae643c69b
Revises: bbb12a12372e
Create Date: 2024-12-09 19:40:19.279735

"""

import sqlalchemy as sa
from alembic import op

Check warning on line 10 in src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py#L9-L10

Added lines #L9 - L10 were not covered by tests

# revision identifiers, used by Alembic.
from sqlalchemy import create_engine
from sqlalchemy.engine import reflection

Check warning on line 14 in src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py#L13-L14

Added lines #L13 - L14 were not covered by tests

from aleph.vm.conf import make_db_url

Check warning on line 16 in src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py#L16

Added line #L16 was not covered by tests

revision = "5c6ae643c69b"
down_revision = "bbb12a12372e"
branch_labels = None
depends_on = None

Check warning on line 21 in src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py#L18-L21

Added lines #L18 - L21 were not covered by tests


def upgrade() -> None:
engine = create_engine(make_db_url())
inspector = reflection.Inspector.from_engine(engine)

Check warning on line 26 in src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py#L24-L26

Added lines #L24 - L26 were not covered by tests

# The table already exists on most CRNs.
tables = inspector.get_table_names()

Check warning on line 29 in src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py#L29

Added line #L29 was not covered by tests
if "executions" in tables:
columns = inspector.get_columns("executions")
column_names = [c["name"] for c in columns]

Check warning on line 32 in src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py#L31-L32

Added lines #L31 - L32 were not covered by tests
if "gpus" not in column_names:
op.add_column("executions", sa.Column("gpus", sa.JSON(), nullable=True))

Check warning on line 34 in src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py#L34

Added line #L34 was not covered by tests


def downgrade() -> None:
op.drop_column("executions", "gpus")

Check warning on line 38 in src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py#L37-L38

Added lines #L37 - L38 were not covered by tests
Loading
Loading