Skip to content

Commit

Permalink
Make vm_id assignment more robust (#714)
Browse files Browse the repository at this point in the history
Remove the counter way to assign a vm_id as it didn't work reliably

Jira ticket: ALEPH-272

That method was broken when persitent instances were loaded at start up.
Since the "new" feature that allow persistent instance across aleph-vm reboot
if one was started then aleph-vm was stopped and restarted the counter method
could reassign the ip and break the existing vm's.

Secundary reason was that the feature wasn't working properly with the default
settings, as `2**available_bits` returned 1. So that code path was only used if
the node owner tweaked some undocumented settings making it hard to identify and
debug in prod nodes.
  • Loading branch information
olethanh authored and Antonyjin committed Nov 19, 2024
1 parent d94ef5f commit d321d5a
Showing 1 changed file with 10 additions and 25 deletions.
35 changes: 10 additions & 25 deletions src/aleph/vm/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,13 @@


class VmPool:
"""Pool of VMs already started and used to decrease response time.
"""Pool of existing VMs
For function VM we keep the VM a while after they have run, so we can reuse them and thus decrease response time.
After running, a VM is saved for future reuse from the same function during a
configurable duration.
The counter is used by the VMs to set their tap interface name and the corresponding
IPv4 subnet.
"""

counter: int # Used to provide distinct ids to network interfaces
executions: dict[ItemHash, VmExecution]
message_cache: dict[str, ExecutableMessage]
network: Network | None
Expand All @@ -45,7 +43,6 @@ class VmPool:
creation_lock: asyncio.Lock

def __init__(self, loop: asyncio.AbstractEventLoop):
self.counter = settings.START_ID_INDEX
self.executions = {}
self.message_cache = {}

Expand Down Expand Up @@ -150,25 +147,13 @@ def get_unique_vm_id(self) -> int:
This identifier is used to name the network interface and in the IPv4 range
dedicated to the VM.
"""
_, network_range = settings.IPV4_ADDRESS_POOL.split("/")
available_bits = int(network_range) - settings.IPV4_NETWORK_PREFIX_LENGTH
self.counter += 1
if self.counter < 2**available_bits:
# In common cases, use the counter itself as the vm_id. This makes it
# easier to debug.
return self.counter
else:
# The value of the counter is too high and some functions such as the
# IPv4 range dedicated to the VM do not support such high values.
#
# We therefore recycle vm_id values from executions that are not running
# anymore.
currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()}
for i in range(settings.START_ID_INDEX, 255**2):
if i not in currently_used_vm_ids:
return i
msg = "No available value for vm_id."
raise ValueError(msg)
# Take the first id that is not already taken
currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()}
for i in range(settings.START_ID_INDEX, 255**2):
if i not in currently_used_vm_ids:
return i
msg = "No available value for vm_id."
raise ValueError(msg)

def get_running_vm(self, vm_hash: ItemHash) -> VmExecution | None:
"""Return a running VM or None. Disables the VM expiration task."""
Expand Down

0 comments on commit d321d5a

Please sign in to comment.