Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add metrics to track success/otherwise of replication requests #8406

Merged
merged 5 commits into from
Sep 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/8406.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add prometheus metrics for replication requests.
40 changes: 28 additions & 12 deletions synapse/replication/http/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,28 @@
from inspect import signature
from typing import Dict, List, Tuple

from synapse.api.errors import (
CodeMessageException,
HttpResponseException,
RequestSendFailed,
SynapseError,
)
from prometheus_client import Counter, Gauge

from synapse.api.errors import HttpResponseException, SynapseError
from synapse.http import RequestTimedOutError
from synapse.logging.opentracing import inject_active_span_byte_dict, trace
from synapse.util.caches.response_cache import ResponseCache
from synapse.util.stringutils import random_string

logger = logging.getLogger(__name__)

# Gauge of outgoing replication requests that are currently in progress,
# labelled by the replication method name.
_pending_outgoing_requests = Gauge(
    name="synapse_pending_outgoing_replication_requests",
    documentation=(
        "Number of active outgoing replication requests, by replication method name"
    ),
    labelnames=["name"],
)

# Counter of outgoing replication requests, labelled by the replication
# method name and by a result code.
_outgoing_request_counter = Counter(
    name="synapse_outgoing_replication_requests",
    documentation=(
        "Number of outgoing replication requests, by replication method name and result"
    ),
    labelnames=["name", "code"],
)


class ReplicationEndpoint(metaclass=abc.ABCMeta):
"""Helper base class for defining new replication HTTP endpoints.
Expand Down Expand Up @@ -138,7 +148,10 @@ def make_client(cls, hs):

instance_map = hs.config.worker.instance_map

outgoing_gauge = _pending_outgoing_requests.labels(cls.NAME)

@trace(opname="outgoing_replication_request")
@outgoing_gauge.track_inprogress()
async def send_request(instance_name="master", **kwargs):
if instance_name == local_instance_name:
raise Exception("Trying to send HTTP request to self")
Expand Down Expand Up @@ -193,23 +206,26 @@ async def send_request(instance_name="master", **kwargs):
try:
result = await request_func(uri, data, headers=headers)
break
except CodeMessageException as e:
if e.code != 504 or not cls.RETRY_ON_TIMEOUT:
except RequestTimedOutError:
if not cls.RETRY_ON_TIMEOUT:
raise

logger.warning("%s request timed out", cls.NAME)
logger.warning("%s request timed out; retrying", cls.NAME)

# If we timed out we probably don't need to worry about backing
# off too much, but lets just wait a little anyway.
await clock.sleep(1)
except HttpResponseException as e:
# We convert to SynapseError as we know that it was a SynapseError
# on the master process that we should send to the client. (And
# on the main process that we should send to the client. (And
# importantly, not stack traces everywhere)
_outgoing_request_counter.labels(cls.NAME, e.code).inc()
raise e.to_synapse_error()
except RequestSendFailed as e:
raise SynapseError(502, "Failed to talk to master") from e
except Exception as e:
_outgoing_request_counter.labels(cls.NAME, "ERR").inc()
raise SynapseError(502, "Failed to talk to main process") from e

_outgoing_request_counter.labels(cls.NAME, 200).inc()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We've never used 200 as a label value before, but I suppose this is just to distinguish successful requests from the other groups. I assume we can't just leave out the 200 here, as otherwise we wouldn't be able to filter by "successful" requests?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

uhhh, I don't think I understand your question.

The idea is that synapse_outgoing_replication_requests tracks the number of requests, broken down by endpoint (cls.NAME) and response code. For non-200 response codes, the counter is bumped at line 222. For successful responses, there's a different code path, so we bump the counter here.

We've got to put something in that label, so we can't just leave out the 200.

return result

return send_request
Expand Down