Skip to content

Commit

Permalink
Log time settings in CI (#6140)
Browse files Browse the repository at this point in the history
## Which problem is this PR solving?
- ES v8.0/v2 integration tests often fail
([exampe](https://github.com/jaegertracing/jaeger/actions/runs/11622432732/job/32429220258)),
but looks like maybe only during certain hours of the day. E.g.
yesterday I rerun the workflow 4 times at night and it repeatedly
failed, then in the morning it succeeded.

## Description of the changes
- Log time settings in the runner to verify whether it's a time-zone
related issue with locating the indices
- Add debug logging to ES to indicate which indices are being used for
reading / writing (did not notice any issues with that in the failed run
above)
- Add debug logging to v2 binary configs
- Fix a bug in `integration.go` where `require.NoError` was using inside
lambda functions passed to `waitForCondition` - it was causing tests to
fail on transient error with Elastic. Change it to log the error and
return `false` so that `waitForCondition` may continue looping & waiting
for successful check.

---------

Signed-off-by: Yuri Shkuro <github@ysh.us>
  • Loading branch information
yurishkuro authored Nov 3, 2024
1 parent 9c1c3ce commit 772d84c
Show file tree
Hide file tree
Showing 18 changed files with 141 additions and 25 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/ci-e2e-elasticsearch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ jobs:
with:
go-version: 1.23.x

- name: time settings
run: |
date
echo TZ="$TZ"
- uses: docker/setup-qemu-action@49b3bc8e6bdd4a60e6116a5414239cba5943d3cf # v3.2.0
- name: Run ${{ matrix.version.distribution }} integration tests
id: test-execution
Expand Down
10 changes: 10 additions & 0 deletions cmd/jaeger/config-badger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ service:
receivers: [otlp]
processors: [batch]
exporters: [jaeger_storage_exporter]
telemetry:
resource:
service.name: jaeger
metrics:
level: detailed
address: 0.0.0.0:8888
logs:
level: info
# TODO Initialize telemetery tracer once OTEL released new feature.
# https://github.com/open-telemetry/opentelemetry-collector/issues/10663

extensions:
healthcheckv2:
Expand Down
10 changes: 10 additions & 0 deletions cmd/jaeger/config-cassandra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ service:
receivers: [otlp]
processors: [batch]
exporters: [jaeger_storage_exporter]
telemetry:
resource:
service.name: jaeger
metrics:
level: detailed
address: 0.0.0.0:8888
logs:
level: debug
# TODO Initialize telemetery tracer once OTEL released new feature.
# https://github.com/open-telemetry/opentelemetry-collector/issues/10663

extensions:
healthcheckv2:
Expand Down
10 changes: 10 additions & 0 deletions cmd/jaeger/config-elasticsearch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ service:
receivers: [otlp]
processors: [batch]
exporters: [jaeger_storage_exporter]
telemetry:
resource:
service.name: jaeger
metrics:
level: detailed
address: 0.0.0.0:8888
logs:
level: debug
# TODO Initialize telemetery tracer once OTEL released new feature.
# https://github.com/open-telemetry/opentelemetry-collector/issues/10663

extensions:
healthcheckv2:
Expand Down
1 change: 1 addition & 0 deletions cmd/jaeger/config-kafka-collector.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ service:
service.name: jaeger_collector
metrics:
level: detailed
address: 0.0.0.0:8888

extensions:
healthcheckv2:
Expand Down
5 changes: 4 additions & 1 deletion cmd/jaeger/config-kafka-ingester.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@ service:
processors: [batch]
exporters: [jaeger_storage_exporter]
telemetry:
resource:
service.name: jaeger
metrics:
address: 0.0.0.0:8889
level: detailed
address: 0.0.0.0:8888
logs:
level: debug

Expand Down
10 changes: 10 additions & 0 deletions cmd/jaeger/config-opensearch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ service:
receivers: [otlp]
processors: [batch]
exporters: [jaeger_storage_exporter]
telemetry:
resource:
service.name: jaeger
metrics:
level: detailed
address: 0.0.0.0:8888
logs:
level: debug
# TODO Initialize telemetery tracer once OTEL released new feature.
# https://github.com/open-telemetry/opentelemetry-collector/issues/10663

extensions:
healthcheckv2:
Expand Down
10 changes: 10 additions & 0 deletions cmd/jaeger/config-remote-storage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ service:
receivers: [otlp]
processors: [batch]
exporters: [jaeger_storage_exporter]
telemetry:
resource:
service.name: jaeger
metrics:
level: detailed
address: 0.0.0.0:8888
logs:
level: debug
# TODO Initialize telemetery tracer once OTEL released new feature.
# https://github.com/open-telemetry/opentelemetry-collector/issues/10663

extensions:
healthcheckv2:
Expand Down
9 changes: 8 additions & 1 deletion cmd/jaeger/config-tail-sampling-always-sample.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,15 @@ service:
processors: [tail_sampling]
exporters: [jaeger_storage_exporter]
telemetry:
resource:
service.name: jaeger
metrics:
level: detailed
address: 0.0.0.0:8888
logs:
level: DEBUG
level: debug
# TODO Initialize telemetery tracer once OTEL released new feature.
# https://github.com/open-telemetry/opentelemetry-collector/issues/10663

extensions:
healthcheckv2:
Expand Down
9 changes: 8 additions & 1 deletion cmd/jaeger/config-tail-sampling-service-name-policy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,15 @@ service:
processors: [tail_sampling]
exporters: [jaeger_storage_exporter]
telemetry:
resource:
service.name: jaeger
metrics:
level: detailed
address: 0.0.0.0:8888
logs:
level: DEBUG
level: debug
# TODO Initialize telemetery tracer once OTEL released new feature.
# https://github.com/open-telemetry/opentelemetry-collector/issues/10663

extensions:
healthcheckv2:
Expand Down
10 changes: 10 additions & 0 deletions cmd/jaeger/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ service:
receivers: [otlp, jaeger, zipkin]
processors: [batch, adaptive_sampling]
exporters: [jaeger_storage_exporter]
telemetry:
resource:
service.name: jaeger
metrics:
level: detailed
address: 0.0.0.0:8888
logs:
level: debug
# TODO Initialize telemetery tracer once OTEL released new feature.
# https://github.com/open-telemetry/opentelemetry-collector/issues/10663

extensions:
healthcheckv2:
Expand Down
2 changes: 2 additions & 0 deletions cmd/jaeger/internal/all-in-one.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ service:
metrics:
level: detailed
address: 0.0.0.0:8888
logs:
level: info
# TODO Initialize telemetery tracer once OTEL released new feature.
# https://github.com/open-telemetry/opentelemetry-collector/issues/10663

Expand Down
26 changes: 20 additions & 6 deletions plugin/storage/es/spanstore/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,19 +135,33 @@ func NewSpanReader(p SpanReaderParams) *SpanReader {
spanIndex: p.SpanIndex,
serviceIndex: p.ServiceIndex,
spanConverter: dbmodel.NewToDomain(p.TagDotReplacement),
timeRangeIndices: getTimeRangeIndexFn(p.Archive, p.UseReadWriteAliases, p.RemoteReadClusters),
sourceFn: getSourceFn(p.Archive, p.MaxDocCount),
maxDocCount: p.MaxDocCount,
useReadWriteAliases: p.UseReadWriteAliases,
logger: p.Logger,
tracer: p.Tracer,
timeRangeIndices: getLoggingTimeRangeIndexFn(
p.Logger,
getTimeRangeIndexFn(p.Archive, p.UseReadWriteAliases, p.RemoteReadClusters),
),
sourceFn: getSourceFn(p.Archive, p.MaxDocCount),
maxDocCount: p.MaxDocCount,
useReadWriteAliases: p.UseReadWriteAliases,
logger: p.Logger,
tracer: p.Tracer,
}
}

type timeRangeIndexFn func(indexName string, indexDateLayout string, startTime time.Time, endTime time.Time, reduceDuration time.Duration) []string

type sourceFn func(query elastic.Query, nextTime uint64) *elastic.SearchSource

func getLoggingTimeRangeIndexFn(logger *zap.Logger, fn timeRangeIndexFn) timeRangeIndexFn {
if !logger.Core().Enabled(zap.DebugLevel) {
return fn
}
return func(indexName string, indexDateLayout string, startTime time.Time, endTime time.Time, reduceDuration time.Duration) []string {
indices := fn(indexName, indexDateLayout, startTime, endTime, reduceDuration)
logger.Debug("Reading from ES indices", zap.Strings("index", indices))
return indices
}
}

func getTimeRangeIndexFn(archive, useReadWriteAliases bool, remoteReadClusters []string) timeRangeIndexFn {
if archive {
var archiveSuffix string
Expand Down
5 changes: 4 additions & 1 deletion plugin/storage/es/spanstore/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"go.opentelemetry.io/otel/sdk/trace/tracetest"
"go.opentelemetry.io/otel/trace"
"go.uber.org/zap"
"go.uber.org/zap/zaptest"

"github.com/jaegertracing/jaeger/internal/metricstest"
"github.com/jaegertracing/jaeger/model"
Expand Down Expand Up @@ -167,7 +168,9 @@ func TestNewSpanReader(t *testing.T) {
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
reader := NewSpanReader(test.params)
params := test.params
params.Logger = zaptest.NewLogger(t)
reader := NewSpanReader(params)
require.NotNil(t, reader)
assert.Equal(t, test.maxSpanAge, reader.maxSpanAge)
})
Expand Down
1 change: 1 addition & 0 deletions plugin/storage/es/spanstore/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ func (s *SpanWriter) WriteSpan(_ context.Context, span *model.Span) error {
s.writeService(serviceIndexName, jsonSpan)
}
s.writeSpan(spanIndexName, jsonSpan)
s.logger.Debug("Wrote span to ES index", zap.String("index", spanIndexName))
return nil
}

Expand Down
8 changes: 3 additions & 5 deletions plugin/storage/es/spanstore/writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,10 @@ func TestSpanWriter_WriteSpan(t *testing.T) {
expectedLogs []string
}{
{
caption: "span insertion error",

caption: "span insertion error",
serviceIndexExists: false,

expectedError: "",
expectedLogs: []string{},
expectedError: "",
expectedLogs: []string{"Wrote span to ES index"},
},
}
for _, tc := range testCases {
Expand Down
30 changes: 24 additions & 6 deletions plugin/storage/integration/integration.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@ func (s *StorageIntegration) testGetServices(t *testing.T) {
found := s.waitForCondition(t, func(t *testing.T) bool {
var err error
actual, err = s.SpanReader.GetServices(context.Background())
require.NoError(t, err)
if err != nil {
t.Log(err)
return false
}
sort.Strings(actual)
t.Logf("Retrieved services: %v", actual)
if len(actual) > len(expected) {
Expand Down Expand Up @@ -246,7 +249,10 @@ func (s *StorageIntegration) testGetOperations(t *testing.T) {
var err error
actual, err = s.SpanReader.GetOperations(context.Background(),
spanstore.OperationQueryParameters{ServiceName: "example-service-1"})
require.NoError(t, err)
if err != nil {
t.Log(err)
return false
}
sort.Slice(actual, func(i, j int) bool {
return actual[i].Name < actual[j].Name
})
Expand Down Expand Up @@ -327,7 +333,10 @@ func (s *StorageIntegration) findTracesByQuery(t *testing.T, query *spanstore.Tr
found := s.waitForCondition(t, func(t *testing.T) bool {
var err error
traces, err = s.SpanReader.FindTraces(context.Background(), query)
require.NoError(t, err)
if err != nil {
t.Log(err)
return false
}
if len(expected) != len(traces) {
t.Logf("Expecting certain number of traces: expected: %d, actual: %d", len(expected), len(traces))
return false
Expand Down Expand Up @@ -464,7 +473,10 @@ func (s *StorageIntegration) testGetDependencies(t *testing.T) {
found := s.waitForCondition(t, func(t *testing.T) bool {
var err error
actual, err = s.DependencyReader.GetDependencies(context.Background(), time.Now(), 5*time.Minute)
require.NoError(t, err)
if err != nil {
t.Log(err)
return false
}
sort.Slice(actual, func(i, j int) bool {
return actual[i].Parent < actual[j].Parent
})
Expand Down Expand Up @@ -495,7 +507,10 @@ func (s *StorageIntegration) testGetThroughput(t *testing.T) {
_ = s.waitForCondition(t, func(t *testing.T) bool {
var err error
actual, err = s.SamplingStore.GetThroughput(start, start.Add(time.Second*time.Duration(10)))
require.NoError(t, err)
if err != nil {
t.Log(err)
return false
}
return assert.ObjectsAreEqualValues(expected, len(actual))
})
assert.Len(t, actual, expected)
Expand All @@ -517,7 +532,10 @@ func (s *StorageIntegration) testGetLatestProbability(t *testing.T) {
found := s.waitForCondition(t, func(t *testing.T) bool {
var err error
actual, err = s.SamplingStore.GetLatestProbabilities()
require.NoError(t, err)
if err != nil {
t.Log(err)
return false
}
return assert.ObjectsAreEqualValues(expected, actual)
})
if !assert.True(t, found) {
Expand Down
5 changes: 1 addition & 4 deletions scripts/spm-integration-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,6 @@ wait_for_services() {
echo "Waiting for services to be up and running..."
check_service_health "Jaeger" "http://localhost:16686"
check_service_health "Prometheus" "http://localhost:9090/graph"
# Grafana is not actually important for the functional test,
# but we still validate that the docker-compose file is correct.
check_service_health "Grafana" "http://localhost:3000"
}

# Function to validate the service metrics
Expand Down Expand Up @@ -111,7 +108,7 @@ validate_service_metrics() {
echo "⏳ Expecting at least 4 non-zero data points"
return 1
fi

# Validate if labels are correct
local url="http://localhost:16686/api/metrics/calls?service=${service}&groupByOperation=true&endTs=&lookback=${fiveMinutes}&step=${fifteenSec}&ratePer=${oneMinute}"

Expand Down

0 comments on commit 772d84c

Please sign in to comment.