Skip to content

Commit

Permalink
fix(tracing): only apply rate limits when trace sample rules are set …
Browse files Browse the repository at this point in the history
…[except for ASM Standalone] [backport 2.14] (#10941)

Backport afb6f2f from #10764 to 2.14.

Aligns the implementation of DD_TRACE_RATE_LIMIT with the java, dotnet
and nodejs tracers:
https://docs.datadoghq.com/tracing/trace_collection/library_config/java/#configuration-options.

Linked to: https://github.com/DataDog/dd-trace-py/pull/10830/files. 

## Changes
- When tracing is enabled only apply DD_TRACE_RATE_LIMIT if a trace
sampling rules or a trace sample rate is set. Otherwise trace rate
limiting should be handled by the Datadog Agent.
- [backwards compatibility] When ASM is enabled in standalone mode, rate
limiting of 1 trace per second should be applied to ALL spans. All rate
limited spans should have priority category of USER.
- When spans are rate limited set `_dd.limit_psr` to the effective rate
limit. Previously this tag was only set is DD_TRACE_RATE_LIMIT != 100.
This restriction is not found in the spec.


## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing
strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met 
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance
implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

Co-authored-by: Munir Abdinur <munir.abdinur@datadoghq.com>
  • Loading branch information
github-actions[bot] and mabdinur authored Oct 4, 2024
1 parent 7fcc3a0 commit 70a2ee1
Show file tree
Hide file tree
Showing 14 changed files with 92 additions and 46 deletions.
47 changes: 30 additions & 17 deletions ddtrace/_trace/tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,18 @@ def __init__(
self._user_sampler: Optional[BaseSampler] = DatadogSampler()
self._asm_enabled = asm_config._asm_enabled
self._appsec_standalone_enabled = asm_config._appsec_standalone_enabled
self._sampler: BaseSampler = DatadogSampler()
self._maybe_opt_out()
self._dogstatsd_url = agent.get_stats_url() if dogstatsd_url is None else dogstatsd_url
self._apm_opt_out = self._asm_enabled and self._appsec_standalone_enabled
if self._apm_opt_out:
self.enabled = False
# Disable compute stats (neither agent or tracer should compute them)
config._trace_compute_stats = False
# If ASM is enabled but tracing is disabled,
# we need to set the rate limiting to 1 trace per minute
# for the backend to consider the service as alive.
self._sampler: BaseSampler = DatadogSampler(rate_limit=1, rate_limit_window=60e9, rate_limit_always_on=True)
else:
self._sampler: BaseSampler = DatadogSampler()
self._compute_stats = config._trace_compute_stats
self._agent_url: str = agent.get_trace_url() if url is None else url
verify_url(self._agent_url)
Expand Down Expand Up @@ -295,19 +304,6 @@ def __init__(
config._subscribe(["tags"], self._on_global_config_update)
config._subscribe(["_tracing_enabled"], self._on_global_config_update)

def _maybe_opt_out(self):
self._apm_opt_out = self._asm_enabled and self._appsec_standalone_enabled
if self._apm_opt_out:
# If ASM is enabled but tracing is disabled,
# we need to set the rate limiting to 1 trace per minute
# for the backend to consider the service as alive.
from ddtrace.internal.rate_limiter import RateLimiter

self._sampler.limiter = RateLimiter(rate_limit=1, time_window=60e9) # 1 trace per minute
# Disable compute stats (neither agent or tracer should compute them)
config._trace_compute_stats = False
self.enabled = False

def _atexit(self) -> None:
key = "ctrl-break" if os.name == "nt" else "ctrl-c"
log.debug(
Expand Down Expand Up @@ -502,12 +498,29 @@ def configure(
if appsec_standalone_enabled is not None:
self._appsec_standalone_enabled = asm_config._appsec_standalone_enabled = appsec_standalone_enabled

if self._appsec_standalone_enabled and self._asm_enabled:
self._apm_opt_out = True
self.enabled = False
# Disable compute stats (neither agent or tracer should compute them)
config._trace_compute_stats = False
# Update the rate limiter to 1 trace per minute when tracing is disabled
if isinstance(sampler, DatadogSampler):
sampler._rate_limit_always_on = True
sampler.limiter.rate_limit = 1
sampler.limiter.time_window = 60e9
else:
if sampler is not None:
log.warning(
"Overriding sampler: %s, a DatadogSampler must be used in ASM Standalone mode",
sampler.__class__,
)
sampler = DatadogSampler(rate_limit=1, rate_limit_window=60e9, rate_limit_always_on=True)
log.debug("ASM standalone mode is enabled, traces will be rate limited at 1 trace per minute")

if sampler is not None:
self._sampler = sampler
self._user_sampler = self._sampler

self._maybe_opt_out()

self._dogstatsd_url = dogstatsd_url or self._dogstatsd_url

if any(x is not None for x in [hostname, port, uds_path, https]):
Expand Down
5 changes: 0 additions & 5 deletions ddtrace/internal/rate_limiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from ddtrace.vendor.debtcollector import deprecate

from ..internal import compat
from ..internal.constants import DEFAULT_SAMPLING_RATE_LIMIT


class RateLimiter(object):
Expand Down Expand Up @@ -59,10 +58,6 @@ def __init__(self, rate_limit: int, time_window: float = 1e9):

self._lock = threading.Lock()

@property
def _has_been_configured(self):
return self.rate_limit != DEFAULT_SAMPLING_RATE_LIMIT

def is_allowed(self, timestamp_ns: Optional[int] = None) -> bool:
"""
Check whether the current request is allowed or not
Expand Down
39 changes: 23 additions & 16 deletions ddtrace/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ class DatadogSampler(RateByServiceSampler):
per second.
"""

__slots__ = ("limiter", "rules", "default_sample_rate")
__slots__ = ("limiter", "rules", "default_sample_rate", "_rate_limit_always_on")

NO_RATE_LIMIT = -1
# deprecate and remove the DEFAULT_RATE_LIMIT field from DatadogSampler
Expand All @@ -214,6 +214,8 @@ def __init__(
rules=None, # type: Optional[List[SamplingRule]]
default_sample_rate=None, # type: Optional[float]
rate_limit=None, # type: Optional[int]
rate_limit_window=1e9, # type: float
rate_limit_always_on=False, # type: bool
):
# type: (...) -> None
"""
Expand All @@ -228,13 +230,16 @@ def __init__(
# Use default sample rate of 1.0
super(DatadogSampler, self).__init__()
self.default_sample_rate = default_sample_rate
effective_sample_rate = default_sample_rate
if default_sample_rate is None:
if ddconfig._get_source("_trace_sample_rate") != "default":
default_sample_rate = float(ddconfig._trace_sample_rate)
effective_sample_rate = float(ddconfig._trace_sample_rate)

if rate_limit is None:
rate_limit = int(ddconfig._trace_rate_limit)

self._rate_limit_always_on = rate_limit_always_on

if rules is None:
env_sampling_rules = ddconfig._trace_sampling_rules
if env_sampling_rules:
Expand All @@ -251,12 +256,12 @@ def __init__(
elif config._raise:
raise TypeError("Rule {!r} must be a sub-class of type ddtrace.sampler.SamplingRules".format(rule))

# DEV: Default sampling rule must come last
if default_sample_rate is not None:
self.rules.append(SamplingRule(sample_rate=default_sample_rate))
# DEV: sampling rule must come last
if effective_sample_rate is not None:
self.rules.append(SamplingRule(sample_rate=effective_sample_rate))

# Configure rate limiter
self.limiter = RateLimiter(rate_limit)
self.limiter = RateLimiter(rate_limit, rate_limit_window)

log.debug("initialized %r", self)

Expand Down Expand Up @@ -315,18 +320,22 @@ def sample(self, span):
sampler = self._default_sampler # type: BaseSampler
sample_rate = self.sample_rate
if matched_rule:
# Client based sampling
sampled = matched_rule.sample(span)
sample_rate = matched_rule.sample_rate
else:
# Agent based sampling
sampled, sampler = super(DatadogSampler, self)._make_sampling_decision(span)
if isinstance(sampler, RateSampler):
sample_rate = sampler.sample_rate
# Apply rate limit
if sampled:
sampled = self.limiter.is_allowed()
if self.limiter._has_been_configured:
span.set_metric(SAMPLING_LIMIT_DECISION, self.limiter.effective_rate)

if matched_rule or self._rate_limit_always_on:
# Avoid rate limiting when trace sample rules and/or sample rates are NOT provided
# by users. In this scenario tracing should default to agent based sampling. ASM
# uses DatadogSampler._rate_limit_always_on to override this functionality.
if sampled:
sampled = self.limiter.is_allowed()
span.set_metric(SAMPLING_LIMIT_DECISION, self.limiter.effective_rate)
_set_sampling_tags(
span,
sampled,
Expand All @@ -345,10 +354,8 @@ def _choose_priority_category_with_rule(self, rule, sampler):
if provenance == "dynamic":
return _PRIORITY_CATEGORY.RULE_DYNAMIC
return _PRIORITY_CATEGORY.RULE_DEF

if self.limiter._has_been_configured:
# If the default rate limiter is NOT used to sample traces
# the sampling priority must be set to manual keep/drop.
# This will disable agent based sample rates.
elif self._rate_limit_always_on:
# backwards compaitbiility for ASM, when the rate limit is always on (ASM standalone mode)
# we want spans to be set to a MANUAL priority to avoid agent based sampling
return _PRIORITY_CATEGORY.USER
return super(DatadogSampler, self)._choose_priority_category(sampler)
13 changes: 12 additions & 1 deletion ddtrace/settings/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,18 @@ def __init__(self):
self._debug_mode = asbool(os.getenv("DD_TRACE_DEBUG", default=False))
self._startup_logs_enabled = asbool(os.getenv("DD_TRACE_STARTUP_LOGS", False))

self._trace_rate_limit = int(os.getenv("DD_TRACE_RATE_LIMIT", default=DEFAULT_SAMPLING_RATE_LIMIT))
rate_limit = os.getenv("DD_TRACE_RATE_LIMIT")
if rate_limit is not None and self._trace_sampling_rules in ("", "[]"):
# This warning will be logged when DD_TRACE_SAMPLE_RATE is set. This is intentional.
# Even though DD_TRACE_SAMPLE_RATE is treated as a global trace sampling rule, this configuration
# is deprecated. We should always encourage users to set DD_TRACE_SAMPLING_RULES instead.
log.warning(
"DD_TRACE_RATE_LIMIT is set to %s and DD_TRACE_SAMPLING_RULES is not set. "
"Tracer rate limitting is only applied to spans that match tracer sampling rules. "
"All other spans will be rate limited by the Datadog Agent via DD_APM_MAX_TPS.",
rate_limit,
)
self._trace_rate_limit = int(rate_limit or DEFAULT_SAMPLING_RATE_LIMIT)
self._partial_flush_enabled = asbool(os.getenv("DD_TRACE_PARTIAL_FLUSH_ENABLED", default=True))
self._partial_flush_min_spans = int(os.getenv("DD_TRACE_PARTIAL_FLUSH_MIN_SPANS", default=300))

Expand Down
5 changes: 3 additions & 2 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,11 @@ The following environment variables for the tracer are supported:
type: int
default: 100
description: |
Maximum number of traces per second to sample. Set a rate limit to avoid the ingestion volume overages in the case of traffic spikes.

Maximum number of traces per second to sample. Set a rate limit to avoid the ingestion volume overages in the case of traffic spikes. This configuration
is only applied when client based sampling is configured, otherwise agent based rate limits are used.
version_added:
v0.33.0:
v2.15.0: Only applied when DD_TRACE_SAMPLE_RATE, DD_TRACE_SAMPLING_RULES, or DD_SPAN_SAMPLING_RULE are set.

DD_TRACE_SAMPLING_RULES:
type: JSON array
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
tracing: Ensures ``DD_TRACE_RATE_LIMIT`` environment variable is only applied to spans for which tracer sampling is configured. For spans not matching sampling rules default rate limits should be applied by the Datadog Agent.
3 changes: 2 additions & 1 deletion tests/integration/test_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,8 @@ def test_rate_limiter_on_spans(tracer):
"""
Ensure that the rate limiter is applied to spans
"""
tracer.configure(sampler=DatadogSampler(rate_limit=10))
# Rate limit is only applied if a sample rate or trace sample rule is set
tracer.configure(sampler=DatadogSampler(default_sample_rate=1, rate_limit=10))
spans = []
# Generate 10 spans with the start and finish time in same second
for x in range(10):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"test": "tag"
},
"metrics": {
"_dd.limit_psr": 1.0,
"_dd.rule_psr": 1.0,
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"runtime-id": "10b45fb36ca3465fa2b6dbd7b10bdccf"
},
"metrics": {
"_dd.limit_psr": 1.0,
"_dd.rule_psr": 1.0,
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"runtime-id": "10b45fb36ca3465fa2b6dbd7b10bdccf"
},
"metrics": {
"_dd.limit_psr": 1.0,
"_dd.rule_psr": 1.0,
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
Expand Down
5 changes: 2 additions & 3 deletions tests/snapshots/test_sampling_with_rate_limit_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@
"type": "",
"error": 0,
"meta": {
"_dd.p.dm": "-3",
"_dd.p.dm": "-0",
"language": "python",
"runtime-id": "9d1cfdf7026e45dea46fc4c83768cc7b"
},
"metrics": {
"_dd.limit_psr": 1.0,
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
"_sampling_priority_v1": 2,
"_sampling_priority_v1": 1,
"process_id": 9723
},
"duration": 71105,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
"runtime-id": "9d1cfdf7026e45dea46fc4c83768cc7b"
},
"metrics": {
"_dd.limit_psr": 1,
"_dd.rule_psr": 0,
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"runtime-id": "423178dbb29d4ca784743ac73e5678c8"
},
"metrics": {
"_dd.limit_psr": 1.0,
"_dd.rule_psr": 1.0,
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
Expand Down
12 changes: 12 additions & 0 deletions tests/tracer/test_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,18 @@ def test_sampling_rule_sample_rate_0():
), "SamplingRule with rate=0 should never keep samples"


@pytest.mark.subprocess(
env={"DD_TRACE_RATE_LIMIT": "2", "DD_TRACE_SAMPLING_RULES": ""},
err=b"DD_TRACE_RATE_LIMIT is set to 2 and DD_TRACE_SAMPLING_RULES is not set. "
b"Tracer rate limitting is only applied to spans that match tracer sampling rules. "
b"All other spans will be rate limited by the Datadog Agent via DD_APM_MAX_TPS.\n",
)
def test_rate_limit_without_sampling_rules_warning():
from ddtrace import config

assert config._trace_rate_limit == 2


def test_datadog_sampler_init():
sampler = DatadogSampler()
assert sampler.rules == [], "DatadogSampler initialized with no arguments should hold no rules"
Expand Down

0 comments on commit 70a2ee1

Please sign in to comment.