From 22c5a705077e00f02a54edcbd04e4edb30a98b53 Mon Sep 17 00:00:00 2001 From: Hugo Herter Date: Tue, 19 Mar 2024 16:54:36 +0100 Subject: [PATCH] Fix: Interface being busy prevented instance creation When attempting to schedule a Firecracker instance, the error below came in a loop. Solution: Log a warning instead of raising an exception. ``` 2024-03-19 15:45:49,346 | ERROR | File "", line 198, in _run_module_as_main File "", line 88, in _run_code File "/opt/aleph-vm/aleph/vm/orchestrator/__main__.py", line 4, in main() File "/opt/aleph-vm/aleph/vm/orchestrator/cli.py", line 371, in main supervisor.run() File "/opt/aleph-vm/aleph/vm/orchestrator/supervisor.py", line 163, in run web.run_app(app, host=settings.SUPERVISOR_HOST, port=settings.SUPERVISOR_PORT) File "/opt/aleph-vm/aiohttp/web.py", line 544, in run_app loop.run_until_complete(main_task) File "/usr/lib/python3.11/asyncio/base_events.py", line 640, in run_until_complete self.run_forever() File "/usr/lib/python3.11/asyncio/base_events.py", line 607, in run_forever self._run_once() File "/usr/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once handle._run() File "/usr/lib/python3.11/asyncio/events.py", line 80, in _run self._context.run(self._callback, *self._args) File "/opt/aleph-vm/aiohttp/web_protocol.py", line 452, in _handle_request resp = await request_handler(request) File "/opt/aleph-vm/sentry_sdk/integrations/aiohttp.py", line 129, in sentry_app_handle response = await old_handle(self, request) File "/opt/aleph-vm/aiohttp/web_app.py", line 543, in _handle resp = await handler(request) File "/opt/aleph-vm/aiohttp/web_middlewares.py", line 114, in impl return await handler(request) File "/opt/aleph-vm/aleph/vm/orchestrator/supervisor.py", line 65, in server_version_middleware resp: web.StreamResponse = await handler(request) File "/opt/aleph-vm/aiohttp/web_urldispatcher.py", line 200, in handler_wrapper result = await result File "/opt/aleph-vm/aleph/vm/orchestrator/run.py", line 129, in run_code_on_request execution = await create_vm_execution_or_raise_http_error(vm_hash=vm_hash, pool=pool) File "/opt/aleph-vm/aleph/vm/orchestrator/run.py", line 90, in create_vm_execution_or_raise_http_error return await create_vm_execution(vm_hash=vm_hash, pool=pool) File "/opt/aleph-vm/aleph/vm/orchestrator/run.py", line 60, in create_vm_execution execution = await pool.create_a_vm( File "/opt/aleph-vm/aleph/vm/pool.py", line 113, in create_a_vm await self.network.create_tap(vm_id, tap_interface) File "/opt/aleph-vm/aleph/vm/network/hostnetwork.py", line 221, in create_tap await interface.create() File "/opt/aleph-vm/aleph/vm/network/interfaces.py", line 128, in create create_tap_interface(ipr, self.device_name) File "/opt/aleph-vm/aleph/vm/network/interfaces.py", line 32, in create_tap_interface ipr.link("add", ifname=device_name, kind="tuntap", mode="tap") File "/opt/aleph-vm/pyroute2/iproute/linux.py", line 1696, in link ret = self.nlm_request(msg, msg_type=msg_type, msg_flags=msg_flags) File "/opt/aleph-vm/pyroute2/netlink/nlsocket.py", line 870, in nlm_request return tuple(self._genlm_request(*argv, **kwarg)) File "/opt/aleph-vm/pyroute2/netlink/nlsocket.py", line 1209, in nlm_request self.put(msg, msg_type, msg_flags, msg_seq=msg_seq) File "/opt/aleph-vm/pyroute2/netlink/nlsocket.py", line 906, in put return self.engine.put( File "/opt/aleph-vm/pyroute2/netlink/nlsocket.py", line 443, in put self.socket.sendto_gate(msg, addr) File "/opt/aleph-vm/pyroute2/netlink/rtnl/iprsocket.py", line 52, in sendto_gate ret = self._sproxy.handle(msg) File "/opt/aleph-vm/pyroute2/netlink/proxy.py", line 61, in handle log.error(''.join(traceback.format_stack())) 2024-03-19 15:45:49,353 | ERROR | Traceback (most recent call last): File "/opt/aleph-vm/pyroute2/netlink/proxy.py", line 43, in handle ret = plugin(msg, self.nl) ^^^^^^^^^^^^^^^^^^^^ File "/opt/aleph-vm/pyroute2/netlink/rtnl/ifinfmsg/proxy.py", line 73, in proxy_newlink return manage_tuntap(msg) ^^^^^^^^^^^^^^^^^^ File "/opt/aleph-vm/pyroute2/netlink/rtnl/ifinfmsg/sync.py", line 60, in decorated ret = f(msg) ^^^^^^ File "/opt/aleph-vm/pyroute2/netlink/rtnl/ifinfmsg/tuntap.py", line 135, in manage_tuntap ioctl(fd, TUNSETIFF, ifr) OSError: [Errno 16] Device or resource busy 2024-03-19 15:45:49,356 | ERROR | Interface vmtap4 is busy - is there another process using it ? Traceback (most recent call last): File "/opt/aleph-vm/aleph/vm/network/interfaces.py", line 32, in create_tap_interface ipr.link("add", ifname=device_name, kind="tuntap", mode="tap") File "/opt/aleph-vm/pyroute2/iproute/linux.py", line 1696, in link ret = self.nlm_request(msg, msg_type=msg_type, msg_flags=msg_flags) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/aleph-vm/pyroute2/netlink/nlsocket.py", line 870, in nlm_request return tuple(self._genlm_request(*argv, **kwarg)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/aleph-vm/pyroute2/netlink/nlsocket.py", line 1214, in nlm_request for msg in self.get( ^^^^^^^^^ File "/opt/aleph-vm/pyroute2/netlink/nlsocket.py", line 873, in get return tuple(self._genlm_get(*argv, **kwarg)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/aleph-vm/pyroute2/netlink/nlsocket.py", line 550, in get raise msg['header']['error'] pyroute2.netlink.exceptions.NetlinkError: (16, 'Device or resource busy') The above exception was the direct cause of the following exception: Traceback (most recent call last): File "/opt/aleph-vm/aleph/vm/orchestrator/run.py", line 90, in create_vm_execution_or_raise_http_error return await create_vm_execution(vm_hash=vm_hash, pool=pool) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/aleph-vm/aleph/vm/orchestrator/run.py", line 60, in create_vm_execution execution = await pool.create_a_vm( ^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/aleph-vm/aleph/vm/pool.py", line 113, in create_a_vm await self.network.create_tap(vm_id, tap_interface) File "/opt/aleph-vm/aleph/vm/network/hostnetwork.py", line 221, in create_tap await interface.create() File "/opt/aleph-vm/aleph/vm/network/interfaces.py", line 128, in create create_tap_interface(ipr, self.device_name) File "/opt/aleph-vm/aleph/vm/network/interfaces.py", line 37, in create_tap_interface raise InterfaceBusyError( aleph.vm.network.interfaces.InterfaceBusyError: Interface vmtap4 is busy - is there another process using it ? 2024-03-19 15:45:49,362 | INFO | 127.0.0.1 [19/Mar/2024:15:45:30 +0000] "GET /vm/3fc0aa9569da840c43e7bd2033c3c580abb4 ``` --- src/aleph/vm/network/interfaces.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/aleph/vm/network/interfaces.py b/src/aleph/vm/network/interfaces.py index fb9d10d7f..6da33db11 100644 --- a/src/aleph/vm/network/interfaces.py +++ b/src/aleph/vm/network/interfaces.py @@ -34,14 +34,12 @@ def create_tap_interface(ipr: IPRoute, device_name: str): if error.code == 17: logger.warning(f"Interface {device_name} already exists") elif error.code == 16: - raise InterfaceBusyError( - f"Interface {device_name} is busy - is there another process using it ?" - ) from error + logger.warning(f"Interface {device_name} is busy - is there another process using it ?") else: raise except OSError as error: if error.errno == errno.EBUSY: - raise InterfaceBusyError(f"Interface {device_name} is busy. Is another process using it ?") from error + logger.warning(f"Interface {device_name} is busy - is there another process using it ?") def add_ip_address(ipr: IPRoute, device_name: str, ip: Union[IPv4Interface, IPv6Interface]):