Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Time how long it takes us to do backfill processing #13535

Merged
merged 4 commits into from
Aug 17, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/13535.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add metrics to time how long it takes us to do backfill processing (`synapse_federation_backfill_processing_before_time_seconds`, `synapse_federation_backfill_processing_after_time_seconds`).
26 changes: 26 additions & 0 deletions synapse/handlers/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
)

import attr
from prometheus_client import Histogram
from signedjson.key import decode_verify_key_bytes
from signedjson.sign import verify_signed_json
from unpaddedbase64 import decode_base64
Expand Down Expand Up @@ -80,6 +81,24 @@
logger = logging.getLogger(__name__)


backfill_processing_before_timer = Histogram(
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved
"synapse_federation_backfill_processing_before_time_seconds",
"sec",
[],
buckets=(
1.0,
5.0,
10.0,
20.0,
30.0,
40.0,
60.0,
80.0,
"+Inf",
),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will probably want to adjust the fidelity in the buckets once we have some real data.

I've seen the linearizer lock take 45s and calculating the likely_domains take 20s for example

)


def get_domains_from_state(state: StateMap[EventBase]) -> List[Tuple[str, int]]:
"""Get joined domains from state

Expand Down Expand Up @@ -138,6 +157,7 @@ class FederationHandler:
def __init__(self, hs: "HomeServer"):
self.hs = hs

self.clock = hs.get_clock()
self.store = hs.get_datastores().main
self._storage_controllers = hs.get_storage_controllers()
self._state_storage_controller = self._storage_controllers.state
Expand Down Expand Up @@ -203,6 +223,7 @@ async def maybe_backfill(
async def _maybe_backfill_inner(
self, room_id: str, current_depth: int, limit: int
) -> bool:
processing_start_time = self.clock.time_msec()
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved
backwards_extremities = [
_BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY)
for event_id, depth in await self.store.get_oldest_event_ids_with_depth_in_room(
Expand Down Expand Up @@ -425,6 +446,11 @@ async def try_backfill(domains: List[str]) -> bool:

return False

processing_end_time = self.clock.time_msec()
backfill_processing_before_timer.observe(
(processing_start_time - processing_end_time) / 1000
)
Comment on lines +475 to +478
Copy link
Contributor Author

@MadLittleMods MadLittleMods Aug 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same problem as #13533 (comment)

I don't think this is working how we expect. All of the buckets just have the same values which means it's saying every record is taking less than 1.0s. It's possible. Did we get the ms to s conversion wrong?

https://prometheus.matrix.org/graph?g0.expr=synapse_federation_backfill_processing_before_time_seconds_bucket&g0.tab=1&g0.stacked=0&g0.show_exemplars=0&g0.range_input=1h&g0.end_input=2022-08-18%2020%3A36%3A02&g0.moment_input=2022-08-18%2020%3A36%3A02

synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="+Inf", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="1.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="10.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="20.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="30.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="40.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="5.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="60.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="80.0", service="synapse"}	7759

So the graph of the percentiles is just average of all of the buckets below it and we get straight lines, https://grafana.matrix.org/d/dYoRgTgVz/messages-timing?orgId=1&from=1660812148100&to=1660855348100&viewPanel=212


The after timing one looks normal:

https://grafana.matrix.org/d/dYoRgTgVz/messages-timing?orgId=1&from=1660812186785&to=1660855386785&viewPanel=213

Copy link
Contributor Author

@MadLittleMods MadLittleMods Aug 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did we get the ms to s conversion wrong?

I don't think so. It gives values like 2.196 when I log it locally.

And we do the same thing for existing metrics which look fine, ex.

).observe((now - ts) / 1000)

Copy link
Contributor Author

@MadLittleMods MadLittleMods Aug 22, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The values in Prometheus are negative: https://prometheus.matrix.org/graph?g0.expr=synapse_federation_backfill_processing_before_time_seconds_sum&g0.tab=1&g0.stacked=0&g0.show_exemplars=0&g0.range_input=1h&g0.end_input=2022-08-22%2017%3A57%3A40&g0.moment_input=2022-08-22%2017%3A57%3A40

Spotted 🕵️‍♀️ I accidentally did start - end 🤦‍♂️

Corrected:

Suggested change
processing_end_time = self.clock.time_msec()
backfill_processing_before_timer.observe(
(processing_start_time - processing_end_time) / 1000
)
processing_end_time = self.clock.time_msec()
backfill_processing_before_timer.observe(
(processing_end_time - processing_start_time) / 1000
)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in #13584


success = await try_backfill(likely_domains)
if success:
return True
Expand Down
48 changes: 34 additions & 14 deletions synapse/handlers/federation_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
Tuple,
)

from prometheus_client import Counter
from prometheus_client import Counter, Histogram

from synapse import event_auth
from synapse.api.constants import (
Expand Down Expand Up @@ -92,6 +92,25 @@
"Events received over federation that we marked as soft_failed",
)

backfill_processing_after_timer = Histogram(
"synapse_federation_backfill_processing_after_time_seconds",
"sec",
[],
buckets=(
1.0,
5.0,
10.0,
20.0,
30.0,
40.0,
60.0,
80.0,
120.0,
180.0,
"+Inf",
),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The after processing can take a while depending on how slow /state_ids is.

ex. _process_pulled_events taking 83s

)


class FederationEventHandler:
"""Handles events that originated from federation.
Expand Down Expand Up @@ -597,20 +616,21 @@ async def backfill(
if not events:
return

# if there are any events in the wrong room, the remote server is buggy and
# should not be trusted.
for ev in events:
if ev.room_id != room_id:
raise InvalidResponseError(
f"Remote server {dest} returned event {ev.event_id} which is in "
f"room {ev.room_id}, when we were backfilling in {room_id}"
)
with backfill_processing_after_timer.time():
# if there are any events in the wrong room, the remote server is buggy and
# should not be trusted.
for ev in events:
if ev.room_id != room_id:
raise InvalidResponseError(
f"Remote server {dest} returned event {ev.event_id} which is in "
f"room {ev.room_id}, when we were backfilling in {room_id}"
)

await self._process_pulled_events(
dest,
events,
backfilled=True,
)
await self._process_pulled_events(
dest,
events,
backfilled=True,
)

@trace
async def _get_missing_events_for_pdu(
Expand Down