Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Cleanups and sanity-checking in cpu and db metrics #5499

Merged
merged 7 commits into from
Jun 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/5499.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Some cleanups and sanity-checking in the CPU and database metrics.
35 changes: 13 additions & 22 deletions synapse/storage/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,22 +167,22 @@ def __init__(self):
self.current_counters = {}
self.previous_counters = {}

def update(self, key, start_time, end_time=None):
if end_time is None:
end_time = time.time()
duration = end_time - start_time
def update(self, key, duration_secs):
count, cum_time = self.current_counters.get(key, (0, 0))
count += 1
cum_time += duration
cum_time += duration_secs
self.current_counters[key] = (count, cum_time)
return end_time

def interval(self, interval_duration, limit=3):
def interval(self, interval_duration_secs, limit=3):
counters = []
for name, (count, cum_time) in iteritems(self.current_counters):
prev_count, prev_time = self.previous_counters.get(name, (0, 0))
counters.append(
((cum_time - prev_time) / interval_duration, count - prev_count, name)
(
(cum_time - prev_time) / interval_duration_secs,
count - prev_count,
name,
)
)

self.previous_counters = dict(self.current_counters)
Expand Down Expand Up @@ -213,7 +213,6 @@ def __init__(self, db_conn, hs):
# is running in mainline, and we have some nice monitoring frontends
# to watch it
self._txn_perf_counters = PerformanceCounters()
self._get_event_counters = PerformanceCounters()

self._get_event_cache = Cache(
"*getEvent*", keylen=3, max_entries=hs.config.event_cache_size
Expand Down Expand Up @@ -361,21 +360,13 @@ def loop():
time_then = self._previous_loop_ts
self._previous_loop_ts = time_now

ratio = (curr - prev) / (time_now - time_then)

top_three_counters = self._txn_perf_counters.interval(
time_now - time_then, limit=3
)
duration = time_now - time_then
ratio = (curr - prev) / duration

top_3_event_counters = self._get_event_counters.interval(
time_now - time_then, limit=3
)
top_three_counters = self._txn_perf_counters.interval(duration, limit=3)

perf_logger.info(
"Total database time: %.3f%% {%s} {%s}",
ratio * 100,
top_three_counters,
top_3_event_counters,
"Total database time: %.3f%% {%s}", ratio * 100, top_three_counters
)

self._clock.looping_call(loop, 10000)
Expand Down Expand Up @@ -457,7 +448,7 @@ def _new_transaction(
transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)

self._current_txn_total_time += duration
self._txn_perf_counters.update(desc, start, end)
self._txn_perf_counters.update(desc, duration)
sql_txn_timer.labels(desc).observe(duration)

@defer.inlineCallbacks
Expand Down
46 changes: 39 additions & 7 deletions synapse/util/logcontext.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,9 @@ def stop(self):
logger.warning("Called stop on logcontext %s without calling start", self)
return

usage_end = get_thread_resource_usage()

self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
utime_delta, stime_delta = self._get_cputime()
self._resource_usage.ru_utime += utime_delta
self._resource_usage.ru_stime += stime_delta

self.usage_start = None

Expand All @@ -357,13 +356,44 @@ def get_resource_usage(self):
# can include resource usage so far.
is_main_thread = threading.current_thread() is self.main_thread
if self.alive and self.usage_start and is_main_thread:
current = get_thread_resource_usage()
res.ru_utime += current.ru_utime - self.usage_start.ru_utime
res.ru_stime += current.ru_stime - self.usage_start.ru_stime
utime_delta, stime_delta = self._get_cputime()
res.ru_utime += utime_delta
res.ru_stime += stime_delta

return res

def _get_cputime(self):
"""Get the cpu usage time so far

Returns: Tuple[float, float]: seconds in user mode, seconds in system mode
"""
current = get_thread_resource_usage()

utime_delta = current.ru_utime - self.usage_start.ru_utime
stime_delta = current.ru_stime - self.usage_start.ru_stime

# sanity check
if utime_delta < 0:
logger.error(
"utime went backwards! %f < %f",
current.ru_utime,
self.usage_start.ru_utime,
)
utime_delta = 0

if stime_delta < 0:
logger.error(
"stime went backwards! %f < %f",
current.ru_stime,
self.usage_start.ru_stime,
)
stime_delta = 0

return utime_delta, stime_delta

def add_database_transaction(self, duration_sec):
if duration_sec < 0:
raise ValueError("DB txn time can only be non-negative")
self._resource_usage.db_txn_count += 1
self._resource_usage.db_txn_duration_sec += duration_sec

Expand All @@ -374,6 +404,8 @@ def add_database_scheduled(self, sched_sec):
sched_sec (float): number of seconds it took us to get a
connection
"""
if sched_sec < 0:
raise ValueError("DB scheduling time can only be non-negative")
self._resource_usage.db_sched_duration_sec += sched_sec

def record_event_fetch(self, event_count):
Expand Down