Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sampler API, use in SDK tracer #225

Merged
merged 25 commits into from
Oct 24, 2019
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions opentelemetry-api/src/opentelemetry/trace/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,10 @@ class TraceOptions(int):
def get_default(cls) -> "TraceOptions":
return cls(cls.DEFAULT)

@property
def recorded(self) -> bool:
c24t marked this conversation as resolved.
Show resolved Hide resolved
return bool(self & TraceOptions.RECORDED)


DEFAULT_TRACE_OPTIONS = TraceOptions.get_default()

Expand Down Expand Up @@ -307,8 +311,8 @@ class SpanContext:
Args:
trace_id: The ID of the trace that this span belongs to.
span_id: This span's ID.
options: Trace options to propagate.
state: Tracing-system-specific info to propagate.
trace_options: Trace options to propagate.
trace_state: Tracing-system-specific info to propagate.
"""

def __init__(
Expand Down Expand Up @@ -361,6 +365,9 @@ def __init__(self, context: "SpanContext") -> None:
def get_context(self) -> "SpanContext":
return self._context

def is_recording_events(self) -> bool:
c24t marked this conversation as resolved.
Show resolved Hide resolved
return False


INVALID_SPAN_ID = 0x0000000000000000
INVALID_TRACE_ID = 0x00000000000000000000000000000000
Expand Down
128 changes: 128 additions & 0 deletions opentelemetry-api/src/opentelemetry/trace/sampling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Copyright 2019, OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
from typing import Dict, Mapping, Optional, Sequence

# pylint: disable=unused-import
from opentelemetry.trace import Link, SpanContext
from opentelemetry.util.types import AttributeValue


class Decision:
"""A sampling decision as applied to a newly-created Span.

Args:
sampled: Whether the `Span` should be sampled.
attributes: Attributes to add to the `Span`.
"""

def __repr__(self) -> str:
return "{}({}, attributes={})".format(
type(self).__name__, str(self.sampled), str(self.attributes)
)

def __init__(
self,
sampled: bool = False,
attributes: Mapping[str, "AttributeValue"] = None,
) -> None:
self.sampled: bool
self.attributes: Dict[str, "AttributeValue"]

self.sampled = sampled
if attributes is None:
self.attributes = {}
else:
self.attributes = dict(attributes)


class Sampler(abc.ABC):
@abc.abstractmethod
def should_sample(
self,
parent_context: Optional["SpanContext"],
trace_id: int,
span_id: int,
name: str,
links: Optional[Sequence["Link"]] = None,
c24t marked this conversation as resolved.
Show resolved Hide resolved
) -> "Decision":
pass


class StaticSampler(Sampler):
"""Sampler that always returns the same decision."""

def __init__(self, decision: "Decision"):
self.decision = decision
c24t marked this conversation as resolved.
Show resolved Hide resolved

def should_sample(
self,
parent_context: Optional["SpanContext"],
trace_id: int,
span_id: int,
name: str,
links: Optional[Sequence["Link"]] = None,
) -> "Decision":
return self.decision


class ProbabilitySampler(Sampler):
def __init__(self, rate: float):
self._rate = rate
self._bound = self.get_bound_for_rate(self._rate)

# The sampler checks the last 8 bytes of the trace ID to decide whether to
# sample a given trace.
CHECK_BYTES = 0xFFFFFFFFFFFFFFFF
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the W3 spec, you should use the high ("left") 8 bytes instead:

Many unique identification generation algorithms create IDs where one part of the value is constant (often time- or host-based), and the other part is a randomly generated value. Because tracing systems may make sampling decisions based on the value of trace-id, for increased interoperability vendors MUST keep the random part of trace-id ID on the left side.

But I wonder if we can find a more robust way to maybe randomly mix some bits here and there together.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered this but decided to stick to the OC behavior (or at least the intended behavior: this also fixes a rounding/OBO bug). FWIW checking the high bytes also seems more correct to me, but in practice -- if people are using short trace IDs in the wild -- sampling every request seems worse than sampling based on the non-random part.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're right about this. I suggested it as a change to the spec in open-telemetry/opentelemetry-specification#331.

Another fun benefit of checking bytes high-to-low is that the sampling decision should be mostly consistent between samplers that check different numbers of bytes. Unlike checking low-to-high where every incremental bit is effectively another coin toss. Ultimately we'll probably just put this number in the spec, but it's a neat property in any case. :D

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's revisit this once open-telemetry/opentelemetry-specification#331 and w3c/trace-context#344 are resolved.


@classmethod
def get_bound_for_rate(cls, rate: float) -> int:
return round(rate * (cls.CHECK_BYTES + 1))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If rate is less than 0.0000000000000000000271, bound will be rounded to 0 which means it never gets sampled.
Do we consider to round up or this is just too pedantic?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually it is 0.0000000000000000000813 since bound is rounded to 1 (and all-zero trace id is illegal according to the TraceContext spec). 😄

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used round intentionally here for what I thought were pedantic reasons: this way each of the 2^64 sampling rate "buckets" are split exactly in half:

            r <= 2^-65     : 0
    2^-65 < r <= 3 * 2^-65 : 1
3 * 2^-65 < r <= 5 * 2^-65 : 2

and etc.

I think if you pass in a sampling rate <= 2^-65 it's more correct for us to make the actual rate 0 than 2^-64.

This doesn't exactly work because float precision is worse than 2^-64 for values close to 1, but it works near 0 and is at least theoretically correct (or should be, there may still be bugs here).

all-zero trace id is illegal according to the TraceContext spec.

Oh that's interesting, so the trace ID space is [0x1, 0xf...f], not [0x0, 0xf...f]. So would you expect this line to be:

return 1 + round(rate * cls.CHECK_BYTES)

I think that'd give us the same behavior as above, but always sample trace ID 0x0.

Should we have special handling for 0x0 trace IDs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also happy to leave this as-is and treat 0x0 as a regular ID here, but treat it differently at span creation time. Up to you. :D

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have any preference here as I cannot think of a scenario where people would need such low but non-zero sample rate (unless someone is instrumenting SETI@home). 😆

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something I missed above: there are and 2^64 - 1 trace IDs besides 0x0 that end in 0x0000000000000000. If we want to always/never sample the all-zero trace ID we have to make that decision before masking.


@property
def rate(self) -> float:
return self._rate

@rate.setter
def rate(self, new_rate: float) -> None:
self._rate = new_rate
self._bound = self.get_bound_for_rate(self._rate)

@property
def bound(self) -> int:
return self._bound

def should_sample(
self,
parent_context: Optional["SpanContext"],
trace_id: int,
span_id: int,
name: str,
links: Optional[Sequence["Link"]] = None,
) -> "Decision":
if parent_context is not None:
return Decision(parent_context.trace_options.recorded, {})
c24t marked this conversation as resolved.
Show resolved Hide resolved

return Decision(trace_id & self.CHECK_BYTES < self.bound, {})
Copy link
Member

@reyang reyang Oct 18, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be a problem for systems that generate shorter trace_id and use zero-padding / const part, especially under low sample rate case.

https://www.w3.org/TR/trace-context/#trace-id

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think we should check fewer bytes? Or make it configurable?

This follows the OC behavior, I actually don't know why we only check the lower 8.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't dug into this.

If we're at the same bar of OpenCensus, we probably should be okay.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If everyone follows the W3C spec here, we should be covered:

When a system operates with a trace-id that is shorter than 16 bytes, it SHOULD fill-in the extra bytes with random values rather than zeroes.

https://www.w3.org/TR/trace-context/#trace-id

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As long as they follow the SHOULDs, which they MAY not. :P

c24t marked this conversation as resolved.
Show resolved Hide resolved


# Samplers that ignore the parent sampling decision and never/always sample.
ALWAYS_OFF = StaticSampler(Decision(False))
ALWAYS_ON = StaticSampler(Decision(True))

# Samplers that respect the parent sampling decision, but otherwise
# never/always sample.
DEFAULT_OFF = ProbabilitySampler(0.0)
DEFAULT_ON = ProbabilitySampler(1.0)
223 changes: 223 additions & 0 deletions opentelemetry-api/tests/trace/test_sampling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
# Copyright 2019, OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from opentelemetry import trace
from opentelemetry.trace import sampling

TO_DEFAULT = trace.TraceOptions(trace.TraceOptions.DEFAULT)
TO_RECORDED = trace.TraceOptions(trace.TraceOptions.RECORDED)


class TestSampler(unittest.TestCase):
def test_always_on(self):
no_record_always_on = sampling.ALWAYS_ON.should_sample(
trace.SpanContext(
0xDEADBEEF, 0xDEADBEF0, trace_options=TO_DEFAULT
),
0xDEADBEF1,
0xDEADBEF2,
"unrecorded parent, sampling on",
)
self.assertTrue(no_record_always_on.sampled)
self.assertEqual(no_record_always_on.attributes, {})

recorded_always_on = sampling.ALWAYS_ON.should_sample(
trace.SpanContext(
0xDEADBEEF, 0xDEADBEF0, trace_options=TO_RECORDED
),
0xDEADBEF1,
0xDEADBEF2,
"recorded parent, sampling on",
)
self.assertTrue(recorded_always_on.sampled)
self.assertEqual(recorded_always_on.attributes, {})

def test_always_off(self):
no_record_always_off = sampling.ALWAYS_OFF.should_sample(
trace.SpanContext(
0xDEADBEEF, 0xDEADBEF0, trace_options=TO_DEFAULT
),
0xDEADBEF1,
0xDEADBEF2,
"unrecorded parent, sampling off",
)
self.assertFalse(no_record_always_off.sampled)
self.assertEqual(no_record_always_off.attributes, {})

recorded_always_on = sampling.ALWAYS_OFF.should_sample(
trace.SpanContext(
0xDEADBEEF, 0xDEADBEF0, trace_options=TO_RECORDED
),
0xDEADBEF1,
0xDEADBEF2,
"recorded parent, sampling off",
)
self.assertFalse(recorded_always_on.sampled)
self.assertEqual(recorded_always_on.attributes, {})

def test_default_on(self):
no_record_default_on = sampling.DEFAULT_ON.should_sample(
trace.SpanContext(
0xDEADBEEF, 0xDEADBEF0, trace_options=TO_DEFAULT
),
0xDEADBEF1,
0xDEADBEF2,
"unrecorded parent, sampling on",
)
self.assertFalse(no_record_default_on.sampled)
self.assertEqual(no_record_default_on.attributes, {})

recorded_default_on = sampling.DEFAULT_ON.should_sample(
trace.SpanContext(
0xDEADBEEF, 0xDEADBEF0, trace_options=TO_RECORDED
),
0xDEADBEF1,
0xDEADBEF2,
"recorded parent, sampling on",
)
self.assertTrue(recorded_default_on.sampled)
self.assertEqual(recorded_default_on.attributes, {})

def test_default_off(self):
no_record_default_off = sampling.DEFAULT_OFF.should_sample(
trace.SpanContext(
0xDEADBEEF, 0xDEADBEF0, trace_options=TO_DEFAULT
),
0xDEADBEF1,
0xDEADBEF2,
"unrecorded parent, sampling off",
)
self.assertFalse(no_record_default_off.sampled)
self.assertEqual(no_record_default_off.attributes, {})

recorded_default_off = sampling.DEFAULT_OFF.should_sample(
trace.SpanContext(
0xDEADBEEF, 0xDEADBEF0, trace_options=TO_RECORDED
),
0xDEADBEF1,
0xDEADBEF2,
"recorded parent, sampling off",
)
self.assertTrue(recorded_default_off.sampled)
self.assertEqual(recorded_default_off.attributes, {})

def test_probability_sampler(self):
sampler = sampling.ProbabilitySampler(0.5)

# Check that we sample based on the trace ID if the parent context is
# null
self.assertTrue(
sampler.should_sample(
None, 0x7FFFFFFFFFFFFFFF, 0xDEADBEEF, "span name"
).sampled
)
self.assertFalse(
sampler.should_sample(
None, 0x8000000000000000, 0xDEADBEEF, "span name"
).sampled
)

# Check that the sampling decision matches the parent context if given,
# and that the sampler ignores the trace ID
self.assertFalse(
sampler.should_sample(
trace.SpanContext(
0xDEADBEF0, 0xDEADBEF1, trace_options=TO_DEFAULT
),
0x8000000000000000,
0xDEADBEEF,
"span name",
).sampled
)
self.assertTrue(
sampler.should_sample(
trace.SpanContext(
0xDEADBEF0, 0xDEADBEF1, trace_options=TO_RECORDED
),
0x8000000000000001,
0xDEADBEEF,
"span name",
).sampled
)

def test_probability_sampler_zero(self):
default_off = sampling.ProbabilitySampler(0.0)
self.assertFalse(
default_off.should_sample(
None, 0x0, 0xDEADBEEF, "span name"
).sampled
)

def test_probability_sampler_one(self):
default_off = sampling.ProbabilitySampler(1.0)
self.assertTrue(
default_off.should_sample(
None, 0xFFFFFFFFFFFFFFFF, 0xDEADBEEF, "span name"
).sampled
)

def test_probability_sampler_limits(self):

# Sample one of every 2^64 (= 5e-20) traces. This is the lowest
# possible meaningful sampling rate, only traces with trace ID 0x0
# should get sampled.
almost_always_off = sampling.ProbabilitySampler(1 / 2 ** 64)
c24t marked this conversation as resolved.
Show resolved Hide resolved
self.assertTrue(
almost_always_off.should_sample(
None, 0x0, 0xDEADBEEF, "span name"
).sampled
)
self.assertFalse(
almost_always_off.should_sample(
None, 0x1, 0xDEADBEEF, "span name"
).sampled
)
self.assertEqual(
sampling.ProbabilitySampler.get_bound_for_rate(1 / 2 ** 64), 0x1
)

# Sample every trace with (last 8 bytes of) trace ID less than
# 0xffffffffffffffff. In principle this is the highest possible
# sampling rate less than 1, but we can't actually express this rate as
# a float!
#
# In practice, the highest possible sampling rate is:
#
# round(sys.float_info.epsilon * 2 ** 64)

almost_always_on = sampling.ProbabilitySampler(1 - (1 / 2 ** 64))
self.assertTrue(
almost_always_on.should_sample(
None, 0xFFFFFFFFFFFFFFFE, 0xDEADBEEF, "span name"
).sampled
)

# These tests are logically consistent, but fail because of the float
# precision issue above. Changing the sampler to check fewer bytes of
# the trace ID will cause these to pass.

# self.assertFalse(
# almost_always_on.should_sample(
# None,
# 0xffffffffffffffff,
# 0xdeadbeef,
# "span name",
# ).sampled
# )
# self.assertEqual(
# sampling.ProbabilitySampler.get_bound_for_rate(1 - (1 / 2 ** 64)),
# 0xffffffffffffffff,
# )
Loading