From 76c68972e8dbf0ded7318758ecbef0f355b4eaa0 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 14 Jan 2025 17:37:23 +0100 Subject: [PATCH] Fix guest-api ConnectionClosedError: Reader at end of file Sentry Issue: ALEPH-VM-STAGING-41 Jira Issue: ALEPH-353 This error was making the diagnostic go down constantly, raising 3K errors in 48h on Sentry. In aleph.vm.guest_api.__main__.put_in_cache ``` ConnectionClosedError: Reader at end of file File "aiohttp/web_app.py", line 569, in _handle return await handler(request) File "aleph/vm/guest_api/__main__.py", line 128, in put_in_cache return web.json_response(await redis.set(f"{prefix}:{key}", value, expire=CACHE_EXPIRES_AFTER)) ``` *Investigation* The error started at Jan 12, 2025 7:26:47 AM CET. The redis server was restarted around the same time by the server's unattended-upgrades (apt). *Analysis* The guest api for the diagnostic VM lost the connection to the redis server (via unix connection) when it was restarted. Since the guest api always reuses the same connection, the error was always triggered. 
In addition, as the diagnostic VM is called regularly by monitoring services, it doesn't time out and stop, so the init process that establishes the redis connection was never redone. *Solution* Check if the redis connection is still ok by pinging the service; if it raises an error, create a new connection. *How to test* Start CRN, call the diagnostic VM redis endpoint http://localhost:4020/vm/63faf8b5db1cf8d965e6a464a0cb8062af8e7df131729e48738342d956f29ace/cache/get/a Then restart the redis service on the CRN ```bash systemctl restart redis ``` and call the diagnostic VM redis endpoint again --- src/aleph/vm/guest_api/__main__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/aleph/vm/guest_api/__main__.py b/src/aleph/vm/guest_api/__main__.py index 8000d52b..e5b89ebe 100644 --- a/src/aleph/vm/guest_api/__main__.py +++ b/src/aleph/vm/guest_api/__main__.py @@ -24,8 +24,15 @@ async def get_redis(address: str = REDIS_ADDRESS) -> aioredis.Redis: global _redis - if _redis is None: + # Ensure the redis connection is still up before returning it + if _redis: + try: + await _redis.ping() + except aioredis.ConnectionClosedError: + _redis = None + if not _redis: _redis = await aioredis.create_redis(address=address) + return _redis