Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add metrics to track /messages response time by room size #13545

Merged
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/13545.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Update metrics to track `/messages` response time by room size.
55 changes: 53 additions & 2 deletions synapse/rest/client/room.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@
""" This module contains REST servlets to do with rooms: /rooms/<paths> """
import logging
import re
from enum import Enum
from typing import TYPE_CHECKING, Awaitable, Dict, List, Optional, Tuple
from urllib import parse as urlparse

from prometheus_client.core import Histogram

from twisted.internet import defer
from twisted.web.server import Request

from synapse import event_auth
Expand Down Expand Up @@ -49,6 +51,7 @@
)
from synapse.http.site import SynapseRequest
from synapse.logging.opentracing import set_tag
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.rest.client._base import client_patterns
from synapse.rest.client.transactions import HttpTransactionCache
from synapse.storage.state import StateFilter
Expand All @@ -62,6 +65,33 @@

logger = logging.getLogger(__name__)


class _RoomSize(Enum):
"""
Enum to differentiate sizes of rooms. This is a pretty good aproximation
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved
about how hard it will be to get events in the room. We could also look at
room "complexity".
"""

# This doesn't necessarily mean the room is a DM, just that there is a DM
# amount of people there.
DM_SIZE = "direct_message_size"
SMALL = "small"
SUBSTANTIAL = "substantial"
LARGE = "large"

@staticmethod
def from_member_count(member_count: int) -> "_RoomSize":
if member_count <= 2:
return _RoomSize.DM_SIZE
elif member_count < 100:
return _RoomSize.SMALL
elif member_count < 1000:
return _RoomSize.SUBSTANTIAL
else:
return _RoomSize.LARGE


# This is an extra metric on top of `synapse_http_server_response_time_seconds`
# which times the same sort of thing but this one allows us to see values
# greater than 10s. We use a separate dedicated histogram with its own buckets
Expand All @@ -70,7 +100,11 @@
messsages_response_timer = Histogram(
"synapse_room_message_list_rest_servlet_response_time_seconds",
"sec",
[],
# We have a label for room size so we can try to see a more realistic
# picture of /messages response time for bigger rooms. We don't want the
# tiny rooms that can always respond fast skewing our results when we're trying
# to optimize the bigger cases.
["room_size"],
buckets=(
0.005,
0.01,
Expand Down Expand Up @@ -587,14 +621,25 @@ class RoomMessageListRestServlet(RestServlet):
def __init__(self, hs: "HomeServer"):
super().__init__()
self._hs = hs
self.clock = hs.get_clock()
self.pagination_handler = hs.get_pagination_handler()
self.auth = hs.get_auth()
self.store = hs.get_datastores().main

@messsages_response_timer.time()
async def on_GET(
self, request: SynapseRequest, room_id: str
) -> Tuple[int, JsonDict]:
processing_start_time = self.clock.time_msec()
# Fire and forget and hope that we get a result by the end.
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved
#
# `get_number_joined_users_in_room(...)` returns an int so we can type
# this as a `Deferred[int]` instead of an optional.
room_member_count_deferred: defer.Deferred[int] = run_as_background_process(
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved
"get_number_joined_users_in_room",
self.store.get_number_joined_users_in_room,
room_id,
)

requester = await self.auth.get_user_by_req(request, allow_guest=True)
pagination_config = await PaginationConfig.from_request(
self.store, request, default_limit=10
Expand Down Expand Up @@ -625,6 +670,12 @@ async def on_GET(
event_filter=event_filter,
)

processing_end_time = self.clock.time_msec()
room_member_count = await room_member_count_deferred
messsages_response_timer.labels(
room_size=_RoomSize.from_member_count(room_member_count)
).observe((processing_start_time - processing_end_time) / 1000)

return 200, msgs


Expand Down
2 changes: 2 additions & 0 deletions synapse/storage/databases/main/roommember.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

from synapse.api.constants import EventTypes, Membership
from synapse.events import EventBase
from synapse.logging.opentracing import trace
from synapse.metrics import LaterGauge
from synapse.metrics.background_process_metrics import (
run_as_background_process,
Expand Down Expand Up @@ -396,6 +397,7 @@ def _get_room_summary_txn(
)

@cached()
@trace
Copy link
Contributor Author

@MadLittleMods MadLittleMods Aug 17, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wish I could put the @trace on the outside but the cached stuff does not like getting wrapped. Same with the other caching decorators I've noticed (like @cachedList). It returns a DeferredCacheDescriptor/DeferredCacheListDescriptor instead of a wrapped function.

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1030, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
  File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 850, in exec_module
  File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
  File "/Users/eric/Documents/github/element/synapse/synapse/app/homeserver.py", line 37, in <module>
    from synapse.app import _base
  File "/Users/eric/Documents/github/element/synapse/synapse/app/_base.py", line 61, in <module>
    from synapse.events.spamcheck import load_legacy_spam_checkers
  File "/Users/eric/Documents/github/element/synapse/synapse/events/spamcheck.py", line 35, in <module>
    from synapse.rest.media.v1._base import FileInfo
  File "/Users/eric/Documents/github/element/synapse/synapse/rest/__init__.py", line 18, in <module>
    from synapse.rest import admin
  File "/Users/eric/Documents/github/element/synapse/synapse/rest/admin/__init__.py", line 27, in <module>
    from synapse.rest.admin._base import admin_patterns, assert_requester_is_admin
  File "/Users/eric/Documents/github/element/synapse/synapse/rest/admin/_base.py", line 19, in <module>
    from synapse.api.auth import Auth
  File "/Users/eric/Documents/github/element/synapse/synapse/api/auth.py", line 22, in <module>
    from synapse import event_auth
  File "/Users/eric/Documents/github/element/synapse/synapse/event_auth.py", line 45, in <module>
    from synapse.storage.databases.main.events_worker import EventRedactBehaviour
  File "/Users/eric/Documents/github/element/synapse/synapse/storage/__init__.py", line 34, in <module>
    from synapse.storage.databases import Databases
  File "/Users/eric/Documents/github/element/synapse/synapse/storage/databases/__init__.py", line 20, in <module>
    from synapse.storage.databases.main.events import PersistEventsStore
  File "/Users/eric/Documents/github/element/synapse/synapse/storage/databases/main/__init__.py", line 33, in <module>
    from .account_data import AccountDataStore
  File "/Users/eric/Documents/github/element/synapse/synapse/storage/databases/main/account_data.py", line 39, in <module>
    from synapse.storage.databases.main.push_rule import PushRulesWorkerStore
  File "/Users/eric/Documents/github/element/synapse/synapse/storage/databases/main/push_rule.py", line 41, in <module>
    from synapse.storage.databases.main.appservice import ApplicationServiceWorkerStore
  File "/Users/eric/Documents/github/element/synapse/synapse/storage/databases/main/appservice.py", line 35, in <module>
    from synapse.storage.databases.main.roommember import RoomMemberWorkerStore
  File "/Users/eric/Documents/github/element/synapse/synapse/storage/databases/main/roommember.py", line 82, in <module>
    class RoomMemberWorkerStore(EventsWorkerStore):
  File "/Users/eric/Documents/github/element/synapse/synapse/storage/databases/main/roommember.py", line 401, in RoomMemberWorkerStore
    async def get_number_joined_users_in_room(self, room_id: str) -> int:
  File "/Users/eric/Documents/github/element/synapse/synapse/logging/opentracing.py", line 954, in trace
    return trace_with_opname(func.__name__)(func)
AttributeError: 'DeferredCacheDescriptor' object has no attribute '__name__'

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can make DeferredCacheDescriptor have a __name__ field to make it compatible.

That said, is there any point in tracing the cached function from the outside? If it's a cache hit then it should take negligible time and it's probably not that interesting in your trace view.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can make DeferredCacheDescriptor have a __name__ field to make it compatible.

I was exploring that option a few days ago but there is more to change to make it work.

That said, is there any point in tracing the cached function from the outside? If it's a cache hit then it should take negligible time and it's probably not that interesting in your trace view.

It's nice to have to so you know whether it's running. It's weird to just not see a call in the trace when you see it in the code.

Copy link
Contributor Author

@MadLittleMods MadLittleMods Aug 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this @trace call is messing up some of the tests (passes without it). The method still gives results so it's unclear to me why it would change things 🤔

https://github.com/matrix-org/synapse/runs/7891614050?check_suite_focus=true#step:7:3384

[FAIL]
Traceback (most recent call last):
  File "/home/runner/work/synapse/synapse/tests/rest/client/test_rooms.py", line 726, in test_post_room_initial_state
    self.assertEqual(50, channel.resource_usage.db_txn_count)
  File "/home/runner/.cache/pypoetry/virtualenvs/matrix-synapse-pswDeSvb-py3.9/lib/python3.9/site-packages/twisted/trial/_synctest.py", line 422, in assertEqual
    super().assertEqual(first, second, msg)
  File "/opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/unittest/case.py", line 837, in assertEqual
    assertion_func(first, second, msg=msg)
  File "/opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/unittest/case.py", line 830, in _baseAssertEqual
    raise self.failureException(msg)
twisted.trial.unittest.FailTest: 50 != 49

tests.rest.client.test_rooms.RoomsCreateTestCase.test_post_room_initial_state

Copy link
Contributor

@reivilibre reivilibre Aug 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wonder if it's because the function being returned by trace always has the same __name__ and so just a guess, but these two get the same cache entries. Seems like a massive trap. I'll see if I can confirm.

@cached()
@trace
def fun1(): ...

@cached()
@trace
def fun2(): ...

edit: nope, the __name__ passes through @trace like you'd hope, so it's not that. It would be good to know why this is causing a test to fail :/

Copy link
Contributor Author

@MadLittleMods MadLittleMods Aug 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll re-introduce this in another PR if I get an inkling to have it again ⏩

async def get_number_joined_users_in_room(self, room_id: str) -> int:
return await self.db_pool.simple_select_one_onecol(
table="current_state_events",
Expand Down