Skip to content

Commit

Permalink
Add 'meta.refinery.original_sample_rate' (honeycombio#508)
Browse files Browse the repository at this point in the history
## Which problem is this PR solving?

Refinery re-writing the sample rate was causing some concern. Writing down the sample rate sent to refinery prior to updating the sample rate should allow for more/better debug.

This is implementing the suggestion of @robbkidd in honeycombio#490 to add annotations to the spans of their original sample rate. This will allow for better understanding of what is happening when there are multiple tiers of sampling happening in the telemetry processing.

## Short description of the changes
At the places where the Span SampleRate was combined with the Trace SampleRate, the original span SampleRate is copied down into a meta.refinery annotation.
  • Loading branch information
epvanhouten authored Sep 9, 2022
1 parent eddd9dc commit 3bd19ea
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 10 deletions.
35 changes: 25 additions & 10 deletions collect/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -395,16 +395,35 @@ func (i *InMemCollector) dealWithSentTrace(keep bool, sampleRate uint, sp *types
}
if keep {
i.Logger.Debug().WithField("trace_id", sp.TraceID).Logf("Sending span because of previous decision to send trace")
if sp.SampleRate < 1 {
sp.SampleRate = 1
}
sp.SampleRate *= sampleRate
mergeTraceAndSpanSampleRates(sp, sampleRate)
i.Transmission.EnqueueSpan(sp)
return
}
i.Logger.Debug().WithField("trace_id", sp.TraceID).Logf("Dropping span because of previous decision to drop trace")
}

func mergeTraceAndSpanSampleRates(sp *types.Span, traceSampleRate uint) {
if traceSampleRate != 1 {
// When the sample rate from the trace is not 1 that means we are
// going to mangle the span sample rate. Write down the original sample
// rate so that that information is more easily recovered
sp.Data["meta.refinery.original_sample_rate"] = sp.SampleRate
}

if sp.SampleRate < 1 {
// See https://docs.honeycomb.io/manage-data-volume/sampling/
// SampleRate is the denominator of the ratio of sampled spans
// HoneyComb treats a missing or 0 SampleRate the same as 1, but
// behaves better/more consistently if the SampleRate is explicitly
// set instead of inferred
sp.SampleRate = 1
}

// if spans are already sampled, take that in to account when computing
// the final rate
sp.SampleRate *= traceSampleRate
}

func isRootSpan(sp *types.Span) bool {
parentID := sp.Data["trace.parent_id"]
if parentID == nil {
Expand Down Expand Up @@ -489,19 +508,15 @@ func (i *InMemCollector) send(trace *types.Trace) {
if i.Config.GetAddRuleReasonToTrace() {
sp.Data["meta.refinery.reason"] = reason
}
if sp.SampleRate < 1 {
sp.SampleRate = 1
}

if i.Config.GetIsDryRun() {
field := i.Config.GetDryRunFieldName()
sp.Data[field] = shouldSend
}
if i.hostname != "" {
sp.Data["meta.refinery.local_hostname"] = i.hostname
}
// if spans are already sampled, take that in to account when computing
// the final rate
sp.SampleRate *= trace.SampleRate
mergeTraceAndSpanSampleRates(sp, trace.SampleRate)
i.Transmission.EnqueueSpan(sp)
}
}
Expand Down
122 changes: 122 additions & 0 deletions collect/collect_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package collect

import (
"fmt"
"runtime"
"strconv"
"testing"
Expand Down Expand Up @@ -98,6 +99,127 @@ func TestAddRootSpan(t *testing.T) {
transmission.Mux.RUnlock()
}

// #490, SampleRate getting stomped could cause confusion if sampling was
// happening upstream of refinery. Writing down what got sent to refinery
// will help people figure out what is going on.
func TestOriginalSampleRateIsNotedInMetaField(t *testing.T) {
transmission := &transmit.MockTransmission{}
transmission.Start()
conf := &config.MockConfig{
GetSendDelayVal: 0,
GetTraceTimeoutVal: 60 * time.Second,
GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 2},
SendTickerVal: 2 * time.Millisecond,
}
coll := &InMemCollector{
Config: conf,
Logger: &logger.NullLogger{},
Transmission: transmission,
Metrics: &metrics.NullMetrics{},
SamplerFactory: &sample.SamplerFactory{
Config: conf,
Logger: &logger.NullLogger{},
},
}

c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{})
coll.cache = c
stc, err := lru.New(15)
assert.NoError(t, err, "lru cache should start")
coll.sentTraceCache = stc

coll.incoming = make(chan *types.Span, 5)
coll.fromPeer = make(chan *types.Span, 5)
coll.datasetSamplers = make(map[string]sample.Sampler)
go coll.collect()
defer coll.Stop()

// Spin until a sample gets triggered
sendAttemptCount := 0
for getEventsLength(transmission) < 1 || sendAttemptCount > 10 {
sendAttemptCount++
span := &types.Span{
TraceID: fmt.Sprintf("trace-%v", sendAttemptCount),
Event: types.Event{
Dataset: "aoeu",
APIKey: legacyAPIKey,
SampleRate: 50,
Data: make(map[string]interface{}),
},
}
coll.AddSpan(span)
time.Sleep(conf.SendTickerVal * 2)
}

transmission.Mux.RLock()
assert.Greater(t, len(transmission.Events), 0, "should be some events transmitted")
assert.Equal(t, uint(50), transmission.Events[0].Data["meta.refinery.original_sample_rate"], "metadata should be populated with original sample rate")
transmission.Mux.RUnlock()
}

// HoneyComb treats a missing or 0 SampleRate the same as 1, but
// behaves better/more consistently if the SampleRate is explicitly
// set instead of inferred
func TestTransmittedSpansShouldHaveASampleRateOfAtLeastOne(t *testing.T) {
transmission := &transmit.MockTransmission{}
transmission.Start()
conf := &config.MockConfig{
GetSendDelayVal: 0,
GetTraceTimeoutVal: 60 * time.Second,
GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1},
SendTickerVal: 2 * time.Millisecond,
}
coll := &InMemCollector{
Config: conf,
Logger: &logger.NullLogger{},
Transmission: transmission,
Metrics: &metrics.NullMetrics{},
SamplerFactory: &sample.SamplerFactory{
Config: conf,
Logger: &logger.NullLogger{},
},
}

c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{})
coll.cache = c
stc, err := lru.New(15)
assert.NoError(t, err, "lru cache should start")
coll.sentTraceCache = stc

coll.incoming = make(chan *types.Span, 5)
coll.fromPeer = make(chan *types.Span, 5)
coll.datasetSamplers = make(map[string]sample.Sampler)
go coll.collect()
defer coll.Stop()

span := &types.Span{
TraceID: fmt.Sprintf("trace-%v", 1),
Event: types.Event{
Dataset: "aoeu",
APIKey: legacyAPIKey,
SampleRate: 0, // This should get lifted to 1
Data: make(map[string]interface{}),
},
}

coll.AddSpan(span)

time.Sleep(conf.SendTickerVal * 2)

transmission.Mux.RLock()
assert.Equal(t, 1, len(transmission.Events), "should be some events transmitted")
assert.Equal(t, uint(1), transmission.Events[0].SampleRate,
"SampleRate should be reset to one after starting at zero")
transmission.Mux.RUnlock()
}

func getEventsLength(transmission *transmit.MockTransmission) int {
transmission.Mux.RLock()
defer transmission.Mux.RUnlock()

return len(transmission.Events)
}

// TestAddSpan tests that adding a span winds up with a trace object in the
// cache
func TestAddSpan(t *testing.T) {
Expand Down

0 comments on commit 3bd19ea

Please sign in to comment.