Skip to content

Commit

Permalink
[batch] make cpu and mem resource readers similar and avoid race (hail-is#13879)
Browse files Browse the repository at this point in the history

Fixes hail-is#13861. The CPU monitor races with container deletion just like
the RAM monitor does. I also switched to catching FileNotFoundError instead of
checking `os.path.exists`, since the file could disappear between the `exists`
check and the `open` call.
  • Loading branch information
danking authored and Dan King committed Oct 23, 2023
1 parent 3e8eaec commit 2bd8cf0
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions batch/batch/resource_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,20 @@ def cpu_ns(self) -> Optional[int]:
# and here for the authoritative source:
# https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/tree/Documentation/admin-guide/cgroup-v2.rst#n1038
usage_file = f'/sys/fs/cgroup/{self.container_name}/cpu.stat'
if os.path.exists(usage_file):
try:
with open(usage_file, 'r', encoding='utf-8') as f:
for line in f.readlines():
stat, val = line.strip().split(' ')
if stat == 'usage_usec':
return int(val) * 1000
return None
return None
except FileNotFoundError:
return None
except OSError as e:
# OSError: [Errno 19] No such device
if e.errno == 19:
return None
raise

def percent_cpu_usage(self) -> Optional[float]:
now_time_ns = time_ns()
Expand All @@ -138,15 +145,15 @@ def memory_usage_bytes(self) -> Optional[int]:
# https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/tree/Documentation/admin-guide/cgroup-v2.rst#n1156
usage_file = f'/sys/fs/cgroup/{self.container_name}/memory.current'
try:
if os.path.exists(usage_file):
with open(usage_file, 'r', encoding='utf-8') as f:
return int(f.read().rstrip())
with open(usage_file, 'r', encoding='utf-8') as f:
return int(f.read().rstrip())
except FileNotFoundError:
return None
except OSError as e:
# OSError: [Errno 19] No such device
if e.errno == 19:
return None
raise
return None

def overlay_storage_usage_bytes(self) -> int:
    """Return the ``used`` byte count reported for the container overlay path.

    The value is the ``used`` field of the result of calling
    ``shutil.disk_usage`` on ``self.container_overlay``.
    """
    overlay_usage = shutil.disk_usage(self.container_overlay)
    return overlay_usage.used
Expand Down

0 comments on commit 2bd8cf0

Please sign in to comment.