Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Retry well known on fail #5850

Merged
merged 4 commits into from
Aug 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/5850.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add retry to well-known lookups if we have recently seen a valid well-known record for the server.
26 changes: 14 additions & 12 deletions synapse/http/federation/matrix_federation_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,17 @@ class MatrixFederationAgent(object):
SRVResolver impl to use for looking up SRV records. None to use a default
implementation.

_well_known_cache (TTLCache|None):
TTLCache impl for storing cached well-known lookups. None to use a default
implementation.
_well_known_resolver (WellKnownResolver|None):
WellKnownResolver to use to perform well-known lookups. None to use a
default implementation.
"""

def __init__(
self,
reactor,
tls_client_options_factory,
_srv_resolver=None,
_well_known_cache=None,
_well_known_resolver=None,
):
self._reactor = reactor
self._clock = Clock(reactor)
Expand All @@ -76,15 +76,17 @@ def __init__(
self._pool.maxPersistentPerHost = 5
self._pool.cachedConnectionTimeout = 2 * 60

self._well_known_resolver = WellKnownResolver(
self._reactor,
agent=Agent(
if _well_known_resolver is None:
_well_known_resolver = WellKnownResolver(
self._reactor,
pool=self._pool,
contextFactory=tls_client_options_factory,
),
well_known_cache=_well_known_cache,
)
agent=Agent(
self._reactor,
pool=self._pool,
contextFactory=tls_client_options_factory,
),
)

self._well_known_resolver = _well_known_resolver

@defer.inlineCallbacks
def request(self, method, uri, headers=None, bodyProducer=None):
Expand Down
126 changes: 101 additions & 25 deletions synapse/http/federation/well_known_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,19 @@
# period to cache .well-known results for by default
WELL_KNOWN_DEFAULT_CACHE_PERIOD = 24 * 3600

# jitter to add to the .well-known default cache ttl
WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER = 10 * 60
# jitter factor to add to the .well-known default cache ttls
WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER = 0.1

# period to cache failure to fetch .well-known for
WELL_KNOWN_INVALID_CACHE_PERIOD = 1 * 3600

# period to cache failure to fetch .well-known if there has recently been a
# valid well-known for that domain.
WELL_KNOWN_DOWN_CACHE_PERIOD = 2 * 60

# period to remember there was a valid well-known after valid record expires
WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID = 2 * 3600

# cap for .well-known cache period
WELL_KNOWN_MAX_CACHE_PERIOD = 48 * 3600

Expand All @@ -49,11 +56,16 @@
# we'll start trying to refetch 1 minute before it expires.
WELL_KNOWN_GRACE_PERIOD_FACTOR = 0.2

# Number of times we retry fetching a well-known for a domain we know recently
# had a valid entry.
WELL_KNOWN_RETRY_ATTEMPTS = 3


logger = logging.getLogger(__name__)


_well_known_cache = TTLCache("well-known")
_had_valid_well_known_cache = TTLCache("had-valid-well-known")


@attr.s(slots=True, frozen=True)
Expand All @@ -65,14 +77,20 @@ class WellKnownResolver(object):
"""Handles well-known lookups for matrix servers.
"""

def __init__(self, reactor, agent, well_known_cache=None):
def __init__(
self, reactor, agent, well_known_cache=None, had_well_known_cache=None
):
self._reactor = reactor
self._clock = Clock(reactor)

if well_known_cache is None:
well_known_cache = _well_known_cache

if had_well_known_cache is None:
had_well_known_cache = _had_valid_well_known_cache

self._well_known_cache = well_known_cache
self._had_valid_well_known_cache = had_well_known_cache
self._well_known_agent = RedirectAgent(agent)

@defer.inlineCallbacks
Expand Down Expand Up @@ -100,7 +118,7 @@ def get_well_known(self, server_name):
# requests for the same server in parallel?
try:
with Measure(self._clock, "get_well_known"):
result, cache_period = yield self._do_get_well_known(server_name)
result, cache_period = yield self._fetch_well_known(server_name)

except _FetchWellKnownFailure as e:
if prev_result and e.temporary:
Expand All @@ -111,18 +129,26 @@ def get_well_known(self, server_name):

result = None

# add some randomness to the TTL to avoid a stampeding herd every hour
# after startup
cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)
if self._had_valid_well_known_cache.get(server_name, False):
# We have recently seen a valid well-known record for this
# server, so we cache the lack of well-known for a shorter time.
cache_period = WELL_KNOWN_DOWN_CACHE_PERIOD
else:
cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD

# add some randomness to the TTL to avoid a stampeding herd
cache_period *= random.uniform(
1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
)

if cache_period > 0:
self._well_known_cache.set(server_name, result, cache_period)

return WellKnownLookupResult(delegated_server=result)

@defer.inlineCallbacks
def _do_get_well_known(self, server_name):
def _fetch_well_known(self, server_name):
"""Actually fetch and parse a .well-known, without checking the cache

Args:
Expand All @@ -134,24 +160,15 @@ def _do_get_well_known(self, server_name):
Returns:
Deferred[Tuple[bytes,int]]: The lookup result and cache period.
"""
uri = b"https://%s/.well-known/matrix/server" % (server_name,)
uri_str = uri.decode("ascii")
logger.info("Fetching %s", uri_str)

had_valid_well_known = self._had_valid_well_known_cache.get(server_name, False)

# We do this in two steps to differentiate between possibly transient
# errors (e.g. can't connect to host, 503 response) and more permenant
# errors (such as getting a 404 response).
try:
response = yield make_deferred_yieldable(
self._well_known_agent.request(b"GET", uri)
)
body = yield make_deferred_yieldable(readBody(response))

if 500 <= response.code < 600:
raise Exception("Non-200 response %s" % (response.code,))
except Exception as e:
logger.info("Error fetching %s: %s", uri_str, e)
raise _FetchWellKnownFailure(temporary=True)
response, body = yield self._make_well_known_request(
server_name, retry=had_valid_well_known
)

try:
if response.code != 200:
Expand All @@ -161,8 +178,11 @@ def _do_get_well_known(self, server_name):
logger.info("Response from .well-known: %s", parsed_body)

result = parsed_body["m.server"].encode("ascii")
except defer.CancelledError:
# Bail if we've been cancelled
raise
except Exception as e:
logger.info("Error fetching %s: %s", uri_str, e)
logger.info("Error parsing well-known for %s: %s", server_name, e)
raise _FetchWellKnownFailure(temporary=False)

cache_period = _cache_period_from_headers(
Expand All @@ -172,13 +192,69 @@ def _do_get_well_known(self, server_name):
cache_period = WELL_KNOWN_DEFAULT_CACHE_PERIOD
# add some randomness to the TTL to avoid a stampeding herd every 24 hours
# after startup
cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)
cache_period *= random.uniform(
1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
)
else:
cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD)
cache_period = max(cache_period, WELL_KNOWN_MIN_CACHE_PERIOD)

# We got a success, mark as such in the cache
self._had_valid_well_known_cache.set(
server_name,
bool(result),
cache_period + WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID,
)

return (result, cache_period)

@defer.inlineCallbacks
def _make_well_known_request(self, server_name, retry):
"""Make the well known request.

This will retry the request if requested and it fails (with unable
to connect or receives a 5xx error).

Args:
server_name (bytes)
retry (bool): Whether to retry the request if it fails.

Returns:
Deferred[tuple[IResponse, bytes]] Returns the response object and
body. Response may be a non-200 response.
"""
uri = b"https://%s/.well-known/matrix/server" % (server_name,)
uri_str = uri.decode("ascii")

i = 0
while True:
i += 1

logger.info("Fetching %s", uri_str)
try:
response = yield make_deferred_yieldable(
self._well_known_agent.request(b"GET", uri)
)
body = yield make_deferred_yieldable(readBody(response))

if 500 <= response.code < 600:
raise Exception("Non-200 response %s" % (response.code,))

return response, body
except defer.CancelledError:
# Bail if we've been cancelled
raise
except Exception as e:
if not retry or i >= WELL_KNOWN_RETRY_ATTEMPTS:
logger.info("Error fetching %s: %s", uri_str, e)
raise _FetchWellKnownFailure(temporary=True)

logger.info("Error fetching %s: %s. Retrying", uri_str, e)

# Sleep briefly in the hopes that they come back up
yield self._clock.sleep(0.5)


def _cache_period_from_headers(headers, time_now=time.time):
cache_controls = _parse_cache_control(headers)
Expand Down
Loading