From 920af0faba61416eb1933237cda1830ac43f6e8e Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Thu, 18 Jun 2020 09:30:26 -0400 Subject: [PATCH] update heartbeat checks in scheduler (#3896) * update heartbeart checks in scheduler * scale heartbeart by number of workers --- distributed/scheduler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/distributed/scheduler.py b/distributed/scheduler.py index 1a5232910e9..ec9a5ef05d1 100644 --- a/distributed/scheduler.py +++ b/distributed/scheduler.py @@ -5303,7 +5303,9 @@ def _reevaluate_occupancy_worker(self, ws): async def check_worker_ttl(self): now = time() for ws in self.workers.values(): - if ws.last_seen < now - self.worker_ttl: + if (ws.last_seen < now - self.worker_ttl) and ( + 10 * heartbeat_interval(len(self.workers)) + ): logger.warning( "Worker failed to heartbeat within %s seconds. Closing: %s", self.worker_ttl, @@ -5571,7 +5573,8 @@ def heartbeat_interval(n): elif n < 200: return 2 else: - return 5 + # no more than 200 hearbeats a second scaled by workers + return n / 200 + 1 class KilledWorker(Exception):