From 53eadb0741688cbfa5586bb569bb0e085ce1a13c Mon Sep 17 00:00:00 2001 From: Sergey Belyakov Date: Thu, 13 Jun 2024 10:13:33 +0300 Subject: [PATCH] Merge tracing fixes to 24-1 (#5458) --- .../blobstorage_pdisk_completion_impl.cpp | 8 +- .../pdisk/blobstorage_pdisk_completion_impl.h | 2 +- .../pdisk/blobstorage_pdisk_impl.cpp | 24 +- .../pdisk/blobstorage_pdisk_impl_log.cpp | 6 +- .../pdisk/blobstorage_pdisk_req_creator.h | 8 +- .../pdisk/blobstorage_pdisk_requestimpl.h | 6 +- ydb/core/cms/console/configs_dispatcher.cpp | 3 +- .../console/jaeger_tracing_configurator.cpp | 221 +++++ .../cms/console/jaeger_tracing_configurator.h | 13 + .../jaeger_tracing_configurator_ut.cpp | 816 ++++++++++++++++++ ydb/core/cms/console/ut/ya.make | 1 + ydb/core/cms/console/ya.make | 2 + ydb/core/cms/json_proxy_proto.h | 10 - .../common_controls/tracing_control.cpp | 62 -- .../control/common_controls/tracing_control.h | 33 - ydb/core/control/common_controls/ya.make | 13 - .../control/immediate_control_board_sampler.h | 25 - .../immediate_control_board_sampler_ut.cpp | 62 -- .../immediate_control_board_throttler.h | 66 -- .../immediate_control_board_throttler_ut.cpp | 126 --- ydb/core/control/ut/ya.make | 2 - ydb/core/control/ya.make | 6 - ydb/core/driver_lib/run/factories.h | 2 +- .../run/kikimr_services_initializers.cpp | 85 +- ydb/core/driver_lib/run/ya.make | 1 + ydb/core/grpc_services/base/base.h | 47 +- ydb/core/grpc_services/base/ya.make | 1 + .../grpc_services/grpc_request_check_actor.h | 39 +- ydb/core/grpc_services/grpc_request_proxy.cpp | 91 +- ydb/core/grpc_services/grpc_request_proxy.h | 3 +- .../grpc_services/rpc_begin_transaction.cpp | 2 +- ydb/core/grpc_services/rpc_calls.h | 1 + .../grpc_services/rpc_commit_transaction.cpp | 2 +- .../rpc_common/rpc_common_kqp_session.cpp | 25 +- ydb/core/grpc_services/rpc_deferrable.h | 8 + ydb/core/grpc_services/rpc_discovery.cpp | 4 + .../grpc_services/rpc_execute_data_query.cpp | 2 +- .../rpc_execute_scheme_query.cpp | 2 +- .../grpc_services/rpc_execute_yql_script.cpp | 2 +- .../grpc_services/rpc_explain_data_query.cpp | 2 +- .../grpc_services/rpc_explain_yql_script.cpp | 2 +- ydb/core/grpc_services/rpc_load_rows.cpp | 9 +- .../grpc_services/rpc_prepare_data_query.cpp | 2 +- ydb/core/grpc_services/rpc_read_rows.cpp | 10 +- .../rpc_rollback_transaction.cpp | 2 +- ydb/core/grpc_services/ya.make | 1 + .../jaeger_tracing/request_discriminator.cpp | 10 + .../jaeger_tracing/request_discriminator.h | 127 +++ ydb/core/jaeger_tracing/sampler.h | 23 + ydb/core/jaeger_tracing/sampler_ut.cpp | 41 + .../sampling_throttling_configurator.cpp | 84 ++ .../sampling_throttling_configurator.h | 46 + .../sampling_throttling_control.cpp | 28 + .../sampling_throttling_control.h | 32 + .../sampling_throttling_control_internals.cpp | 62 ++ .../sampling_throttling_control_internals.h | 25 + ydb/core/jaeger_tracing/settings.h | 165 ++++ ydb/core/jaeger_tracing/throttler.cpp | 78 ++ ydb/core/jaeger_tracing/throttler.h | 27 + ydb/core/jaeger_tracing/throttler_ut.cpp | 175 ++++ ydb/core/jaeger_tracing/ut/ya.make | 14 + ydb/core/jaeger_tracing/ya.make | 26 + ydb/core/keyvalue/keyvalue_intermediate.cpp | 4 +- .../keyvalue_storage_read_request.cpp | 2 +- .../keyvalue/keyvalue_storage_request.cpp | 2 +- ydb/core/keyvalue/ya.make | 1 - ydb/core/kqp/session_actor/kqp_query_state.h | 2 +- ydb/core/protos/config.proto | 142 +-- ydb/core/tablet/tablet_req_writelog.cpp | 8 +- ydb/core/tablet_flat/flat_exec_seat.h | 4 +- ydb/core/tablet_flat/flat_executor.cpp | 2 +- ydb/core/tablet_flat/tablet_flat_executor.h | 10 +- ydb/core/testlib/test_client.cpp | 8 +- ydb/core/tx/datashard/datashard.cpp | 24 +- ydb/core/tx/datashard/datashard__op_rows.cpp | 2 +- .../tx/datashard/datashard__read_iterator.cpp | 2 +- ydb/core/tx/datashard/datashard_pipeline.cpp | 2 +- ydb/core/tx/datashard/datashard_ut_trace.cpp | 75 +- ydb/core/tx/datashard/export_common.h | 1 + .../tx/tx_proxy/upload_rows_common_impl.h | 34 +- ydb/core/util/wilson.h | 14 + ydb/core/util/ya.make | 2 + ydb/core/ya.make | 1 + ydb/library/actors/wilson/wilson_span.cpp | 14 + ydb/library/actors/wilson/wilson_span.h | 11 +- ydb/library/actors/wilson/wilson_trace.cpp | 4 +- ydb/library/actors/wilson/wilson_trace.h | 2 +- ydb/library/actors/wilson/wilson_uploader.cpp | 345 ++++++-- ydb/library/actors/wilson/wilson_uploader.h | 15 +- ydb/library/services/services.proto | 1 + ydb/library/wilson_ids/wilson.h | 94 +- ydb/services/keyvalue/grpc_service.cpp | 32 +- ydb/services/local_discovery/grpc_func_call.h | 8 + ydb/services/local_discovery/grpc_service.cpp | 24 +- ydb/services/ydb/ydb_query.cpp | 26 +- ydb/services/ydb/ydb_table.cpp | 69 +- ydb/tools/cfg/static.py | 155 +++- ydb/tools/cfg/validation.py | 94 +- 98 files changed, 3080 insertions(+), 979 deletions(-) create mode 100644 ydb/core/cms/console/jaeger_tracing_configurator.cpp create mode 100644 ydb/core/cms/console/jaeger_tracing_configurator.h create mode 100644 ydb/core/cms/console/jaeger_tracing_configurator_ut.cpp delete mode 100644 ydb/core/control/common_controls/tracing_control.cpp delete mode 100644 ydb/core/control/common_controls/tracing_control.h delete mode 100644 ydb/core/control/common_controls/ya.make delete mode 100644 ydb/core/control/immediate_control_board_sampler.h delete mode 100644 ydb/core/control/immediate_control_board_sampler_ut.cpp delete mode 100644 ydb/core/control/immediate_control_board_throttler.h delete mode 100644 ydb/core/control/immediate_control_board_throttler_ut.cpp create mode 100644 ydb/core/jaeger_tracing/request_discriminator.cpp create mode 100644 ydb/core/jaeger_tracing/request_discriminator.h create mode 100644 ydb/core/jaeger_tracing/sampler.h create mode 100644 ydb/core/jaeger_tracing/sampler_ut.cpp create mode 100644 ydb/core/jaeger_tracing/sampling_throttling_configurator.cpp create mode 100644 ydb/core/jaeger_tracing/sampling_throttling_configurator.h create mode 100644 ydb/core/jaeger_tracing/sampling_throttling_control.cpp create mode 100644 ydb/core/jaeger_tracing/sampling_throttling_control.h create mode 100644 ydb/core/jaeger_tracing/sampling_throttling_control_internals.cpp create mode 100644 ydb/core/jaeger_tracing/sampling_throttling_control_internals.h create mode 100644 ydb/core/jaeger_tracing/settings.h create mode 100644 ydb/core/jaeger_tracing/throttler.cpp create mode 100644 ydb/core/jaeger_tracing/throttler.h create mode 100644 ydb/core/jaeger_tracing/throttler_ut.cpp create mode 100644 ydb/core/jaeger_tracing/ut/ya.make create mode 100644 ydb/core/jaeger_tracing/ya.make create mode 100644 ydb/core/util/wilson.h diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp index 90f22ebd94ae..9039624714c2 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp @@ -15,7 +15,7 @@ namespace NPDisk { void TCompletionLogWrite::Exec(TActorSystem *actorSystem) { // bool isNewChunksCommited = false; if (CommitedLogChunks) { - NWilson::TSpan span(TWilson::PDisk, TraceId.Clone(), "PDisk.CommitLogChunks"); + NWilson::TSpan span(TWilson::PDiskBasic, TraceId.Clone(), "PDisk.CommitLogChunks"); auto* req = PDisk->ReqCreator.CreateFromArgs(std::move(CommitedLogChunks), std::move(span)); PDisk->InputRequest(req); //isNewChunksCommited = true; @@ -146,7 +146,7 @@ TBuffer *TCompletionChunkReadPart::GetBuffer() { } void TCompletionChunkReadPart::Exec(TActorSystem *actorSystem) { - auto execSpan = Span.CreateChild(TWilson::PDisk, "PDisk.CompletionChunkReadPart.Exec"); + auto execSpan = Span.CreateChild(TWilson::PDiskDetailed, "PDisk.CompletionChunkReadPart.Exec"); Y_ABORT_UNLESS(actorSystem); Y_ABORT_UNLESS(CumulativeCompletion); if (TCompletionAction::Result != EIoResult::Ok) { @@ -306,7 +306,7 @@ TCompletionChunkRead::~TCompletionChunkRead() { } void TCompletionChunkRead::Exec(TActorSystem *actorSystem) { - auto execSpan = Span.CreateChild(TWilson::PDisk, "PDisk.CompletionChunkRead.Exec"); + auto execSpan = Span.CreateChild(TWilson::PDiskDetailed, "PDisk.CompletionChunkRead.Exec"); THolder result = MakeHolder(NKikimrProto::OK, Read->ChunkIdx, Read->Offset, Read->Cookie, PDisk->GetStatusFlags(Read->Owner, Read->OwnerGroupType), ""); result->Data = std::move(CommonBuffer); @@ -393,7 +393,7 @@ void TChunkTrimCompletion::Exec(TActorSystem *actorSystem) { << ui64(responseTimeMs) << " sizeBytes# " << SizeBytes); LWPROBE(PDiskTrimResponseTime, PDisk->PDiskId, ReqId.Id, responseTimeMs, SizeBytes); PDisk->Mon.Trim.CountResponse(); - NWilson::TSpan span(TWilson::PDisk, std::move(TraceId), "PDisk.TryTrimChunk", NWilson::EFlags::AUTO_END, actorSystem); + NWilson::TSpan span(TWilson::PDiskBasic, std::move(TraceId), "PDisk.TryTrimChunk", NWilson::EFlags::AUTO_END, actorSystem); span.Attribute("size", static_cast(SizeBytes)); TTryTrimChunk *tryTrim = PDisk->ReqCreator.CreateFromArgs(SizeBytes, std::move(span)); PDisk->InputRequest(tryTrim); diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.h index 415f5a2549e3..5ae63cc4384f 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.h @@ -100,7 +100,7 @@ class TCompletionChunkWrite : public TCompletionAction { } void Exec(TActorSystem *actorSystem) override { - auto execSpan = Span.CreateChild(TWilson::PDisk, "PDisk.CompletionChunkWrite.Exec"); + auto execSpan = Span.CreateChild(TWilson::PDiskDetailed, "PDisk.CompletionChunkWrite.Exec"); double responseTimeMs = HPMilliSecondsFloat(HPNow() - StartTime); LOG_DEBUG_S(*actorSystem, NKikimrServices::BS_PDISK, "PDiskId# " << PDiskId << " ReqId# " << ReqId diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp index 310bdced23dc..e7249e8bb52b 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp @@ -1051,7 +1051,7 @@ TPDisk::EChunkReadPieceResult TPDisk::ChunkReadPiece(TIntrusivePtr & ui64 readOffset = Format.Offset(read->ChunkIdx, read->FirstSector, currentSectorOffset); // TODO: Get this from the drive - NWilson::TSpan span(TWilson::PDisk, std::move(traceId), "PDisk.CompletionChunkReadPart", NWilson::EFlags::NONE, ActorSystem); + NWilson::TSpan span(TWilson::PDiskBasic, std::move(traceId), "PDisk.CompletionChunkReadPart", NWilson::EFlags::NONE, ActorSystem); traceId = span.GetTraceId(); THolder completion(new TCompletionChunkReadPart(this, read, bytesToRead, payloadBytesToRead, payloadOffset, read->FinalCompletion, isTheLastPart, Cfg->UseT1ha0HashInFooter, std::move(span))); @@ -2250,7 +2250,7 @@ void TPDisk::ProcessChunkWriteQueue() { for (auto it = JointChunkWrites.begin(); it != JointChunkWrites.end(); ++it) { TRequestBase *req = (*it); req->SpanStack.PopOk(); - req->SpanStack.Push(TWilson::PDisk, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); + req->SpanStack.Push(TWilson::PDiskDetailed, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); switch (req->GetType()) { case ERequestType::RequestChunkWritePiece: { @@ -2281,7 +2281,7 @@ void TPDisk::ProcessChunkReadQueue() { for (auto& req : JointChunkReads) { req->SpanStack.PopOk(); - req->SpanStack.Push(TWilson::PDisk, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); + req->SpanStack.Push(TWilson::PDiskDetailed, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); switch (req->GetType()) { case ERequestType::RequestChunkReadPiece: { @@ -2353,7 +2353,7 @@ void TPDisk::ProcessChunkTrimQueue() { for (auto it = JointChunkTrims.begin(); it != JointChunkTrims.end(); ++it) { TChunkTrim *trim = (*it); trim->SpanStack.PopOk(); - trim->SpanStack.Push(TWilson::PDisk, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); + trim->SpanStack.Push(TWilson::PDiskDetailed, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); ui64 chunkOffset = Format.ChunkSize * ui64(trim->ChunkIdx); ui64 offset = chunkOffset + trim->Offset; ui64 trimSize = trim->Size; @@ -2890,7 +2890,7 @@ bool TPDisk::PreprocessRequest(TRequestBase *request) { --state.OperationsInProgress; --inFlight->ChunkReads; }; - auto completionSpan = request->SpanStack.CreateChild(TWilson::PDisk, "PDisk.CompletionChunkRead"); + auto completionSpan = request->SpanStack.CreateChild(TWilson::PDiskTopLevel, "PDisk.CompletionChunkRead"); read->FinalCompletion = new TCompletionChunkRead(this, read, std::move(onDestroy), state.Nonce, std::move(completionSpan)); static_cast(request)->SelfPointer = std::move(read); @@ -2976,7 +2976,7 @@ bool TPDisk::PreprocessRequest(TRequestBase *request) { }; ev.Completion = MakeHolder(ev.Sender, result.release(), &Mon, PDiskId, ev.CreationTime, ev.TotalSize, ev.PriorityClass, std::move(onDestroy), ev.ReqId, - ev.SpanStack.CreateChild(TWilson::PDisk, "PDisk.CompletionChunkWrite")); + ev.SpanStack.CreateChild(TWilson::PDiskTopLevel, "PDisk.CompletionChunkWrite")); return true; } @@ -3171,7 +3171,7 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { && static_cast(job->Payload)->GetType() == ERequestType::RequestLogWrite) { TLogWrite &batch = *static_cast(job->Payload); - if (auto span = request->SpanStack.Push(TWilson::PDisk, "PDisk.InScheduler.InLogWriteBatch")) { + if (auto span = request->SpanStack.Push(TWilson::PDiskDetailed, "PDisk.InScheduler.InLogWriteBatch")) { span->Attribute("Batch.ReqId", static_cast(batch.ReqId.Id)); } batch.AddToBatch(static_cast(request)); @@ -3197,7 +3197,7 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { SplitChunkJobSize(whole->TotalSize, &smallJobSize, &largeJobSize, &smallJobCount); for (ui32 idx = 0; idx < smallJobCount; ++idx) { // Schedule small job. - auto span = request->SpanStack.CreateChild(TWilson::PDisk, "PDisk.ChunkWritePiece", NWilson::EFlags::AUTO_END); + auto span = request->SpanStack.CreateChild(TWilson::PDiskBasic, "PDisk.ChunkWritePiece", NWilson::EFlags::AUTO_END); span.Attribute("small_job_idx", idx) .Attribute("is_last_piece", false); TChunkWritePiece *piece = new TChunkWritePiece(whole, idx * smallJobSize, smallJobSize, std::move(span)); @@ -3205,7 +3205,7 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { AddJobToForseti(cbs, piece, request->JobKind); } // Schedule large job (there always is one) - auto span = request->SpanStack.CreateChild(TWilson::PDisk, "PDisk.ChunkWritePiece", NWilson::EFlags::AUTO_END); + auto span = request->SpanStack.CreateChild(TWilson::PDiskBasic, "PDisk.ChunkWritePiece", NWilson::EFlags::AUTO_END); span.Attribute("is_last_piece", true); TChunkWritePiece *piece = new TChunkWritePiece(whole, smallJobCount * smallJobSize, largeJobSize, std::move(span)); piece->EstimateCost(DriveModel); @@ -3225,7 +3225,7 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { ui32 largeJobSize = totalSectors - smallJobSize * smallJobCount; for (ui32 idx = 0; idx < smallJobCount; ++idx) { - auto span = request->SpanStack.CreateChild(TWilson::PDisk, "PDisk.ChunkReadPiece", NWilson::EFlags::AUTO_END); + auto span = request->SpanStack.CreateChild(TWilson::PDiskBasic, "PDisk.ChunkReadPiece", NWilson::EFlags::AUTO_END); span.Attribute("small_job_idx", idx) .Attribute("is_last_piece", false); // Schedule small job. @@ -3238,7 +3238,7 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { AddJobToForseti(cbs, piece, request->JobKind); } // Schedule large job (there always is one) - auto span = request->SpanStack.CreateChild(TWilson::PDisk, "PDisk.ChunkReadPiece"); + auto span = request->SpanStack.CreateChild(TWilson::PDiskBasic, "PDisk.ChunkReadPiece"); span.Attribute("is_last_piece", true); auto piece = new TChunkReadPiece(read, smallJobCount * smallJobSize, largeJobSize * Format.SectorSize, true, std::move(span)); @@ -3274,7 +3274,7 @@ void TPDisk::SplitChunkJobSize(ui32 totalSize, ui32 *outSmallJobSize, ui32 *outL void TPDisk::AddJobToForseti(NSchLab::TCbs *cbs, TRequestBase *request, NSchLab::EJobKind jobKind) { LWTRACK(PDiskAddToScheduler, request->Orbit, PDiskId, request->ReqId.Id, HPSecondsFloat(request->CreationTime), request->Owner, request->IsFast, request->PriorityClass); - request->SpanStack.Push(TWilson::PDisk, "PDisk.InScheduler"); + request->SpanStack.Push(TWilson::PDiskDetailed, "PDisk.InScheduler"); TIntrusivePtr job = ForsetiScheduler.CreateJob(); job->Payload = request; job->Cost = request->Cost; diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp index 44d5bb98a11c..d884c84ee794 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp @@ -535,7 +535,7 @@ void TPDisk::ReadAndParseMainLog(const TActorId &pDiskActor) { void TPDisk::ProcessLogReadQueue() { for (auto& req : JointLogReads) { req->SpanStack.PopOk(); - req->SpanStack.Push(TWilson::PDisk, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); + req->SpanStack.Push(TWilson::PDiskDetailed, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); switch (req->GetType()) { case ERequestType::RequestLogRead: { @@ -736,7 +736,7 @@ void TPDisk::ProcessLogWriteQueueAndCommits() { TStringStream errorReason; NKikimrProto::EReplyStatus status = ValidateRequest(logWrite, errorReason); if (status == NKikimrProto::OK) { - logWrite->SpanStack.Push(TWilson::PDisk, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); + logWrite->SpanStack.Push(TWilson::PDiskDetailed, "PDisk.InBlockDevice", NWilson::EFlags::AUTO_END); LogWrite(*logWrite, logChunksToCommit); logWrite->ScheduleTime = HPNow(); if (auto logWriteTraceId = logWrite->SpanStack.GetTraceId()) { @@ -1282,7 +1282,7 @@ void TPDisk::MarkChunksAsReleased(TReleaseChunks& req) { } if (req.IsChunksFromLogSplice) { - auto *releaseReq = ReqCreator.CreateFromArgs(std::move(req.ChunksToRelease), req.SpanStack.CreateChild(TWilson::PDisk, "PDisk.ReleaseChunks")); + auto *releaseReq = ReqCreator.CreateFromArgs(std::move(req.ChunksToRelease), req.SpanStack.CreateChild(TWilson::PDiskTopLevel, "PDisk.ReleaseChunks")); auto flushAction = MakeHolder(this, THolder(releaseReq)); diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_req_creator.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_req_creator.h index c8ab252f8f7d..dc0d534b4e1a 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_req_creator.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_req_creator.h @@ -218,7 +218,7 @@ class TReqCreator { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // TODO: Make all functions in style [[nodiscard]] TChunkTrim* CreateChunkTrim(ui32 chunkIdx, ui32 offset, ui64 size, const NWilson::TSpan& parent) { - NWilson::TSpan span = parent.CreateChild(TWilson::PDisk, "PDisk.ChunkTrim"); + NWilson::TSpan span = parent.CreateChild(TWilson::PDiskTopLevel, "PDisk.ChunkTrim"); span.Attribute("chunk_idx", chunkIdx) .Attribute("offset", offset) .Attribute("size", static_cast(size)) @@ -228,7 +228,7 @@ class TReqCreator { } [[nodiscard]] TLogWrite* CreateLogWrite(NPDisk::TEvLog &ev, const TActorId &sender, double& burstMs, NWilson::TTraceId traceId) { - NWilson::TSpan span(TWilson::PDisk, std::move(traceId), "PDisk.LogWrite", NWilson::EFlags::AUTO_END, ActorSystem); + NWilson::TSpan span(TWilson::PDiskTopLevel, std::move(traceId), "PDisk.LogWrite", NWilson::EFlags::AUTO_END, ActorSystem); span.Attribute("pdisk_id", PDiskId); TReqId reqId(TReqId::LogWrite, AtomicIncrement(LastReqId)); @@ -245,7 +245,7 @@ class TReqCreator { [[nodiscard]] TChunkRead* CreateChunkRead(const NPDisk::TEvChunkRead &ev, const TActorId &sender, double& burstMs, NWilson::TTraceId traceId) { - NWilson::TSpan span(TWilson::PDisk, std::move(traceId), "PDisk.ChunkRead", NWilson::EFlags::AUTO_END, ActorSystem); + NWilson::TSpan span(TWilson::PDiskTopLevel, std::move(traceId), "PDisk.ChunkRead", NWilson::EFlags::AUTO_END, ActorSystem); span.Attribute("pdisk_id", PDiskId); TReqId reqId(TReqId::ChunkRead, AtomicIncrement(LastReqId)); @@ -261,7 +261,7 @@ class TReqCreator { [[nodiscard]] TChunkWrite* CreateChunkWrite(const NPDisk::TEvChunkWrite &ev, const TActorId &sender, double& burstMs, NWilson::TTraceId traceId) { - NWilson::TSpan span(TWilson::PDisk, std::move(traceId), "PDisk.ChunkWrite", NWilson::EFlags::AUTO_END, ActorSystem); + NWilson::TSpan span(TWilson::PDiskTopLevel, std::move(traceId), "PDisk.ChunkWrite", NWilson::EFlags::AUTO_END, ActorSystem); span.Attribute("pdisk_id", PDiskId); TReqId reqId(TReqId::ChunkWrite, AtomicIncrement(LastReqId)); diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_requestimpl.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_requestimpl.h index ee89bc085331..d8d6da23393d 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_requestimpl.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_requestimpl.h @@ -185,7 +185,7 @@ class TLogRead : public TRequestBase { TLogRead(const NPDisk::TEvReadLog::TPtr &ev, ui32 pdiskId, TAtomicBase reqIdx) : TRequestBase(ev->Sender, TReqId(TReqId::LogRead, reqIdx), ev->Get()->Owner, ev->Get()->OwnerRound, NPriInternal::LogRead, - NWilson::TSpan(TWilson::PDisk, std::move(ev->TraceId), "PDisk.LogRead")) + NWilson::TSpan(TWilson::PDiskTopLevel, std::move(ev->TraceId), "PDisk.LogRead")) , Position(ev->Get()->Position) , SizeLimit(ev->Get()->SizeLimit) { @@ -214,7 +214,7 @@ class TLogReadContinue : public TRequestBase { TLogReadContinue(const NPDisk::TEvReadLogContinue::TPtr &ev, ui32 pdiskId, TAtomicBase /*reqIdx*/) : TRequestBase(ev->Sender, ev->Get()->ReqId, 0, 0, NPriInternal::LogRead, - NWilson::TSpan(TWilson::PDisk, std::move(ev->TraceId), "PDisk.LogReadContinue")) + NWilson::TSpan(TWilson::PDiskTopLevel, std::move(ev->TraceId), "PDisk.LogReadContinue")) , Data(ev->Get()->Data) , Size(ev->Get()->Size) , Offset(ev->Get()->Offset) @@ -850,7 +850,7 @@ class TAskForCutLog : public TRequestBase { public: TAskForCutLog(const NPDisk::TEvAskForCutLog::TPtr &ev, ui32 pdiskId, TAtomicBase reqIdx) : TRequestBase(ev->Sender, TReqId(TReqId::AskForCutLog, reqIdx), ev->Get()->Owner, ev->Get()->OwnerRound, NPriInternal::Other, - NWilson::TSpan(TWilson::PDisk, std::move(ev->TraceId), "PDisk.AskForCutLog") + NWilson::TSpan(TWilson::PDiskTopLevel, std::move(ev->TraceId), "PDisk.AskForCutLog") ) { if (auto span = SpanStack.PeekTop()) { diff --git a/ydb/core/cms/console/configs_dispatcher.cpp b/ydb/core/cms/console/configs_dispatcher.cpp index d1834fe1018d..f415cf3243c0 100644 --- a/ydb/core/cms/console/configs_dispatcher.cpp +++ b/ydb/core/cms/console/configs_dispatcher.cpp @@ -57,7 +57,8 @@ const THashSet DYNAMIC_KINDS({ (ui32)NKikimrConsole::TConfigItem::TenantPoolConfigItem, (ui32)NKikimrConsole::TConfigItem::TenantSlotBrokerConfigItem, (ui32)NKikimrConsole::TConfigItem::AllowEditYamlInUiItem, - (ui32)NKikimrConsole::TConfigItem::BackgroundCleaningConfigItem + (ui32)NKikimrConsole::TConfigItem::BackgroundCleaningConfigItem, + (ui32)NKikimrConsole::TConfigItem::TracingConfigItem, }); const THashSet NON_YAML_KINDS({ diff --git a/ydb/core/cms/console/jaeger_tracing_configurator.cpp b/ydb/core/cms/console/jaeger_tracing_configurator.cpp new file mode 100644 index 000000000000..b20fa0429eb5 --- /dev/null +++ b/ydb/core/cms/console/jaeger_tracing_configurator.cpp @@ -0,0 +1,221 @@ +#include "jaeger_tracing_configurator.h" + +#include "configs_dispatcher.h" +#include "console.h" + +#include +#include +#include + +namespace NKikimr::NConsole { + +using namespace NJaegerTracing; + +class TJaegerTracingConfigurator : public TActorBootstrapped { +public: + static constexpr NKikimrServices::TActivity::EType ActorActivityType() { + return NKikimrServices::TActivity::JAEGER_TRACING_CONFIGURATOR; + } + + TJaegerTracingConfigurator(TSamplingThrottlingConfigurator tracingConfigurator, + NKikimrConfig::TTracingConfig cfg); + + void Bootstrap(const TActorContext& ctx); + +private: + void Handle(TEvConsole::TEvConfigNotificationRequest::TPtr& ev, const TActorContext& ctx); + + STRICT_STFUNC(StateWork, + HFunc(TEvConsole::TEvConfigNotificationRequest, Handle) + IgnoreFunc(TEvConfigsDispatcher::TEvSetConfigSubscriptionResponse) + ) + + void ApplyConfigs(const NKikimrConfig::TTracingConfig& cfg); + static TVector GetRequestTypes(const NKikimrConfig::TTracingConfig::TSelectors& selectors); + static TMaybe GetDatabase(const NKikimrConfig::TTracingConfig::TSelectors& selectors); + static TSettings> GetSettings(const NKikimrConfig::TTracingConfig& cfg); + + TSamplingThrottlingConfigurator TracingConfigurator; + NKikimrConfig::TTracingConfig initialConfig; +}; + +TJaegerTracingConfigurator::TJaegerTracingConfigurator( + TSamplingThrottlingConfigurator tracingConfigurator, + NKikimrConfig::TTracingConfig cfg) + : TracingConfigurator(std::move(tracingConfigurator)) + , initialConfig(std::move(cfg)) +{} + +void TJaegerTracingConfigurator::Bootstrap(const TActorContext& ctx) { + LOG_DEBUG_S(ctx, NKikimrServices::CMS_CONFIGS, "TJaegerTracingConfigurator: Bootstrap"); + Become(&TThis::StateWork); + + ApplyConfigs(initialConfig); + + LOG_DEBUG_S(ctx, NKikimrServices::CMS_CONFIGS, "TJaegerTracingConfigurator: subscribing to config updates"); + ui32 item = static_cast(NKikimrConsole::TConfigItem::TracingConfigItem); + ctx.Send(MakeConfigsDispatcherID(SelfId().NodeId()), + new TEvConfigsDispatcher::TEvSetConfigSubscriptionRequest(item)); +} + +void TJaegerTracingConfigurator::Handle(TEvConsole::TEvConfigNotificationRequest::TPtr& ev, const TActorContext& ctx) { + auto& rec = ev->Get()->Record; + + LOG_INFO_S(ctx, NKikimrServices::CMS_CONFIGS, "TJaegerTracingConfigurator: got new config: " << rec.GetConfig().ShortDebugString()); + + ApplyConfigs(rec.GetConfig().GetTracingConfig()); + + auto resp = MakeHolder(rec); + LOG_TRACE_S(ctx, NKikimrServices::CMS_CONFIGS, + "TJaegerTracingConfigurator: Send TEvConfigNotificationResponse"); + ctx.Send(ev->Sender, resp.Release(), 0, ev->Cookie); +} + +void TJaegerTracingConfigurator::ApplyConfigs(const NKikimrConfig::TTracingConfig& cfg) { + auto settings = GetSettings(cfg); + return TracingConfigurator.UpdateSettings(std::move(settings)); +} + +TVector TJaegerTracingConfigurator::GetRequestTypes(const NKikimrConfig::TTracingConfig::TSelectors& selectors) { + TVector requestTypes; + bool hasErrors = false; + for (const auto& requestType: selectors.GetRequestTypes()) { + if (auto it = NameToRequestType.FindPtr(requestType)) { + requestTypes.push_back(*it); + } else { + ALOG_ERROR(NKikimrServices::CMS_CONFIGS, "Failed to parse request type \"" << requestType << "\""); + hasErrors = true; + } + } + + if (hasErrors) { + return {}; + } + if (requestTypes.empty()) { + requestTypes.push_back(ERequestType::UNSPECIFIED); + } + return requestTypes; +} + +TMaybe TJaegerTracingConfigurator::GetDatabase(const NKikimrConfig::TTracingConfig::TSelectors& selectors) { + if (selectors.HasDatabase()) { + return selectors.GetDatabase(); + } + return NothingObject; +} + +TSettings> TJaegerTracingConfigurator::GetSettings(const NKikimrConfig::TTracingConfig& cfg) { + TSettings> settings; + + size_t tag = 0; + + for (const auto& samplingRule : cfg.GetSampling()) { + const auto& scope = samplingRule.GetScope(); + + auto requestTypes = GetRequestTypes(scope); + if (requestTypes.empty()) { + ALOG_ERROR(NKikimrServices::CMS_CONFIGS, "failed to parse request type in the rule " + << samplingRule.ShortDebugString() << ". Skipping the rule"); + continue; + } + + if (!samplingRule.HasLevel() || !samplingRule.HasFraction() || !samplingRule.HasMaxTracesPerMinute()) { + ALOG_ERROR(NKikimrServices::CMS_CONFIGS, "missing required fields in rule " << samplingRule.ShortDebugString() + << " (required fields are: level, fraction, max_traces_per_minute). Skipping the rule"); + continue; + } + if (samplingRule.GetMaxTracesPerMinute() == 0) { + ALOG_ERROR(NKikimrServices::CMS_CONFIGS, "max_traces_per_minute should never be zero. Found in rule " << samplingRule.GetMaxTracesPerMinute() + << ". Skipping the rule"); + continue; + } + + ui64 level = samplingRule.GetLevel(); + double fraction = samplingRule.GetFraction(); + if (level > 15) { + ALOG_ERROR(NKikimrServices::CMS_CONFIGS, "sampling level exceeds maximum allowed value (" << level + << " provided, maximum is 15). Lowering the level"); + level = 15; + } + if (fraction < 0 || fraction > 1) { + ALOG_ERROR(NKikimrServices::CMS_CONFIGS, "provided fraction " << fraction + << " violated range [0; 1]. Clamping it to the range"); + fraction = std::min(1.0, std::max(0.0, fraction)); + } + + TSamplingRule> rule { + .Level = static_cast(level), + .Sampler = fraction, + .Throttler = TWithTag { + .Value = TThrottlingSettings { + .MaxTracesPerMinute = samplingRule.GetMaxTracesPerMinute(), + .MaxTracesBurst = samplingRule.GetMaxTracesBurst(), + }, + .Tag = tag++, + }, + }; + + for (auto requestType: requestTypes) { + auto& requestTypeRules = settings.SamplingRules[static_cast(requestType)]; + auto database = GetDatabase(scope); + if (database) { + requestTypeRules.DatabaseRules[*database].push_back(rule); + } else { + requestTypeRules.Global.push_back(rule); + } + } + } + + for (const auto& throttlingRule : cfg.GetExternalThrottling()) { + const auto& scope = throttlingRule.GetScope(); + + auto requestTypes = GetRequestTypes(throttlingRule.GetScope()); + if (requestTypes.empty()) { + ALOG_ERROR(NKikimrServices::CMS_CONFIGS, "failed to parse request type in rule " + << throttlingRule.ShortDebugString() << ". Skipping the rule"); + continue; + } + + if (!throttlingRule.HasMaxTracesPerMinute()) { + ALOG_ERROR(NKikimrServices::CMS_CONFIGS, "missing required field max_traces_per_minute in rule " + << throttlingRule.ShortDebugString() << ". Skipping the rule"); + continue; + } + if (throttlingRule.GetMaxTracesPerMinute() == 0) { + ALOG_ERROR(NKikimrServices::CMS_CONFIGS, "max_traces_per_minute should never be zero. Found in rule " << throttlingRule.GetMaxTracesPerMinute() + << ". Skipping the rule"); + continue; + } + + ui64 maxRatePerMinute = throttlingRule.GetMaxTracesPerMinute(); + ui64 maxBurst = throttlingRule.GetMaxTracesBurst(); + TExternalThrottlingRule> rule { + .Throttler = TWithTag { + .Value = TThrottlingSettings { + .MaxTracesPerMinute = maxRatePerMinute, + .MaxTracesBurst = maxBurst, + }, + .Tag = tag++, + } + }; + + for (auto requestType : requestTypes) { + auto& requestTypeRules = settings.ExternalThrottlingRules[static_cast(requestType)]; + auto database = GetDatabase(scope); + if (database) { + requestTypeRules.DatabaseRules[*database].push_back(rule); + } else { + requestTypeRules.Global.push_back(rule); + } + } + } + + return settings; +} + +IActor* CreateJaegerTracingConfigurator(TSamplingThrottlingConfigurator tracingConfigurator, + NKikimrConfig::TTracingConfig cfg) { + return new TJaegerTracingConfigurator(std::move(tracingConfigurator), std::move(cfg)); +} + +} // namespace NKikimr::NConsole diff --git a/ydb/core/cms/console/jaeger_tracing_configurator.h b/ydb/core/cms/console/jaeger_tracing_configurator.h new file mode 100644 index 000000000000..54b22bd0741c --- /dev/null +++ b/ydb/core/cms/console/jaeger_tracing_configurator.h @@ -0,0 +1,13 @@ +#pragma once + +#include "defs.h" + +#include +#include + +namespace NKikimr::NConsole { + +IActor* CreateJaegerTracingConfigurator(NJaegerTracing::TSamplingThrottlingConfigurator tracingConfigurator, + NKikimrConfig::TTracingConfig cfg); + +} // namespace NKikimr::NConsole diff --git a/ydb/core/cms/console/jaeger_tracing_configurator_ut.cpp b/ydb/core/cms/console/jaeger_tracing_configurator_ut.cpp new file mode 100644 index 000000000000..8867c49b4522 --- /dev/null +++ b/ydb/core/cms/console/jaeger_tracing_configurator_ut.cpp @@ -0,0 +1,816 @@ +#include "ut_helpers.h" +#include "jaeger_tracing_configurator.h" + +#include + +#include +#include + +#include + +namespace NKikimr { + +using namespace NConsole; +using namespace NUT; +using namespace NJaegerTracing; + +namespace { + +TTenantTestConfig::TTenantPoolConfig StaticTenantPoolConfig() { + TTenantTestConfig::TTenantPoolConfig res = { + // Static slots {tenant, {cpu, memory, network}} + {{ {DOMAIN1_NAME, {1, 1, 1}} }}, + // NodeType + "type1" + }; + return res; +} + +TTenantTestConfig DefaultConsoleTestConfig() { + TTenantTestConfig res = { + // Domains {name, schemeshard {{ subdomain_names }}} + {{ {DOMAIN1_NAME, SCHEME_SHARD1_ID, TVector()} }}, + // HiveId + HIVE_ID, + // FakeTenantSlotBroker + true, + // FakeSchemeShard + false, + // CreateConsole + true, + // Nodes {tenant_pool_config, data_center} + {{ + {StaticTenantPoolConfig()}, + }}, + // DataCenterCount + 1, + // CreateConfigsDispatcher + true + }; + return res; +} + +void InitJaegerTracingConfigurator( + TTenantTestRuntime& runtime, + TSamplingThrottlingConfigurator configurator, + const NKikimrConfig::TTracingConfig& initCfg +) { + runtime.Register(CreateJaegerTracingConfigurator(std::move(configurator), initCfg)); + + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvConfigsDispatcher::EvSetConfigSubscriptionResponse, 1); + runtime.DispatchEvents(std::move(options)); +} + +void WaitForUpdate(TTenantTestRuntime& runtime) { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvConsole::EvConfigNotificationResponse, 1); + runtime.DispatchEvents(std::move(options)); +} + +void ConfigureAndWaitUpdate(TTenantTestRuntime& runtime, const NKikimrConfig::TTracingConfig& cfg, ui32 order) { + auto configItem = MakeConfigItem(NKikimrConsole::TConfigItem::TracingConfigItem, + NKikimrConfig::TAppConfig(), {}, {}, "", "", order, + NKikimrConsole::TConfigItem::OVERWRITE, ""); + configItem.MutableConfig()->MutableTracingConfig()->CopyFrom(cfg); + + auto* event = new TEvConsole::TEvConfigureRequest; + event->Record.AddActions()->CopyFrom(MakeAddAction(configItem)); + + runtime.SendToConsole(event); + WaitForUpdate(runtime); +} + +auto& RandomChoice(auto& Container) { + return Container[RandomNumber() % Container.size()]; +} + +class TTracingControls { +public: + enum ETraceState { + OFF, + SAMPLED, + EXTERNAL, + }; + + TTracingControls(TVector> controls) + : Controls(std::move(controls)) + {} + + std::pair HandleTracing(bool isExternal, TRequestDiscriminator discriminator) { + auto& control = RandomChoice(Controls); + + NWilson::TTraceId traceId; + if (isExternal) { + traceId = NWilson::TTraceId::NewTraceId(TComponentTracingLevels::ProductionVerbose, Max()); + } + auto before = traceId.Clone(); + + control->HandleTracing(traceId, discriminator); + if (!traceId) { + return {OFF, 0}; + } + + ETraceState state; + if (traceId == before) { + state = ETraceState::EXTERNAL; + } else { + state = ETraceState::SAMPLED; + } + + return {state, traceId.GetVerbosity()}; + } + +private: + TVector> Controls; +}; + +std::pair + CreateSamplingThrottlingConfigurator(size_t n, TIntrusivePtr timeProvider) { + auto randomProvider = CreateDefaultRandomProvider(); + TSamplingThrottlingConfigurator configurator(timeProvider, randomProvider); + TVector> controls; + for (size_t i = 0; i < n; ++i) { + controls.emplace_back(configurator.GetControl()); + } + + return {TTracingControls(std::move(controls)), std::move(configurator)}; +} + +struct TTimeProviderMock : public ITimeProvider { + TTimeProviderMock(TInstant now) : Now_(now) {} + + TInstant Now() override { + return Now_; + } + + void Advance(TDuration delta) { + Now_ += delta; + } + + TInstant Now_; +}; + +} // namespace anonymous + +Y_UNIT_TEST_SUITE(TJaegerTracingConfiguratorTests) { + Y_UNIT_TEST(DefaultConfig) { + TTenantTestRuntime runtime(DefaultConsoleTestConfig()); + auto timeProvider = MakeIntrusive(TInstant::Now()); + auto [controls, configurator] = CreateSamplingThrottlingConfigurator(10, timeProvider); + InitJaegerTracingConfigurator(runtime, std::move(configurator), {}); + + for (size_t i = 0; i < 100; ++i) { + auto [state, _] = controls.HandleTracing(false, {}); + UNIT_ASSERT_EQUAL(state, TTracingControls::OFF); // No requests are sampled + } + + for (size_t i = 0; i < 100; ++i) { + auto [state, _] = controls.HandleTracing(true, {}); + UNIT_ASSERT_EQUAL(state, TTracingControls::OFF); // No request with trace-id are traced + } + WaitForUpdate(runtime); // Initial update + } + + Y_UNIT_TEST(GlobalRules) { + TTenantTestRuntime runtime(DefaultConsoleTestConfig()); + auto timeProvider = MakeIntrusive(TInstant::Now()); + auto [controls, configurator] = CreateSamplingThrottlingConfigurator(10, timeProvider); + NKikimrConfig::TTracingConfig cfg; + { + auto rule = cfg.AddExternalThrottling(); + rule->SetMaxTracesBurst(0); + rule->SetMaxTracesPerMinute(60); + } + { + auto rule = cfg.AddSampling(); + rule->SetFraction(1. / 3); + rule->SetLevel(5); + rule->SetMaxTracesBurst(10); + rule->SetMaxTracesPerMinute(30); + } + InitJaegerTracingConfigurator(runtime, std::move(configurator), cfg); + + std::array discriminators{ + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_READROWS, + .Database = "/Root/test3", + }, + TRequestDiscriminator{ + .RequestType = ERequestType::KEYVALUE_READ, + }, + TRequestDiscriminator{ + .Database = "/Root/test2", + }, + TRequestDiscriminator{}, + }; + + { + size_t sampled = 0; + size_t traced = 0; + for (size_t i = 0; i < 1000; ++i) { + auto [state, level] = controls.HandleTracing(true, RandomChoice(discriminators)); + + switch (state) { + case TTracingControls::OFF: + break; + case TTracingControls::SAMPLED: + UNIT_ASSERT_EQUAL(level, 5); + ++sampled; + break; + case TTracingControls::EXTERNAL: + ++traced; + break; + } + timeProvider->Advance(TDuration::MilliSeconds(250)); + } + UNIT_ASSERT_EQUAL(traced, 250); + UNIT_ASSERT(sampled >= 110 && sampled <= 135); + } + timeProvider->Advance(TDuration::Minutes(1)); + + { + for (size_t i = 0; i < 100; ++i) { + auto [state, _] = controls.HandleTracing(true, RandomChoice(discriminators)); + UNIT_ASSERT_EQUAL(state, TTracingControls::EXTERNAL); + timeProvider->Advance(TDuration::Seconds(1)); + } + } + timeProvider->Advance(TDuration::Minutes(1)); + + { + size_t sampled = 0; + for (size_t i = 0; i < 750; ++i) { + auto [state, level] = controls.HandleTracing(false, RandomChoice(discriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + ++sampled; + UNIT_ASSERT_EQUAL(level, 5); + } + timeProvider->Advance(TDuration::Seconds(1)); + } + UNIT_ASSERT(sampled >= 210 && sampled <= 300); + } + } + + Y_UNIT_TEST(RequestTypeThrottler) { + TTenantTestRuntime runtime(DefaultConsoleTestConfig()); + auto timeProvider = MakeIntrusive(TInstant::Now()); + auto [controls, configurator] = CreateSamplingThrottlingConfigurator(10, timeProvider); + NKikimrConfig::TTracingConfig cfg; + { + auto rule = cfg.AddExternalThrottling(); + rule->SetMaxTracesBurst(5); + rule->SetMaxTracesPerMinute(120); + rule->MutableScope()->AddRequestTypes()->assign("KeyValue.ExecuteTransaction"); + } + InitJaegerTracingConfigurator(runtime, std::move(configurator), cfg); + + for (size_t i = 0; i < 100; ++i) { + auto [state, _] = controls.HandleTracing(false, {}); + UNIT_ASSERT_EQUAL(state, TTracingControls::OFF); // No requests are sampled + } + + UNIT_ASSERT_EQUAL(controls.HandleTracing(true, {}).first, TTracingControls::OFF); // No request type + UNIT_ASSERT_EQUAL(controls.HandleTracing(true, {.RequestType = ERequestType::KEYVALUE_READ}).first, + TTracingControls::OFF); // Wrong request type + std::array executeTransactionDiscriminators{ + TRequestDiscriminator{ + .RequestType = ERequestType::KEYVALUE_EXECUTETRANSACTION, + }, + TRequestDiscriminator{ + .RequestType = ERequestType::KEYVALUE_EXECUTETRANSACTION, + .Database = "/Root/test", + } + }; + + for (size_t i = 0; i < 6; ++i) { + UNIT_ASSERT_EQUAL( + controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)).first, + TTracingControls::EXTERNAL); + } + UNIT_ASSERT_EQUAL( + controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)).first, + TTracingControls::OFF); + timeProvider->Advance(TDuration::MilliSeconds(1500)); + for (size_t i = 0; i < 3; ++i) { + UNIT_ASSERT_EQUAL( + controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)).first, + TTracingControls::EXTERNAL); + } + UNIT_ASSERT_EQUAL( + controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)).first, + TTracingControls::OFF); + + WaitForUpdate(runtime); // Initial update + cfg.MutableExternalThrottling(0)->SetMaxTracesPerMinute(10); + cfg.MutableExternalThrottling(0)->SetMaxTracesBurst(2); + ConfigureAndWaitUpdate(runtime, cfg, 1); + + for (size_t i = 0; i < 3; ++i) { + UNIT_ASSERT_EQUAL( + controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)).first, + TTracingControls::EXTERNAL); + } + auto [state, _] = controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)); + UNIT_ASSERT_EQUAL( + state, + TTracingControls::OFF); + + timeProvider->Advance(TDuration::Seconds(12)); + for (size_t i = 0; i < 2; ++i) { + UNIT_ASSERT_EQUAL( + controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)).first, + TTracingControls::EXTERNAL); + } + UNIT_ASSERT_EQUAL( + controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)).first, + TTracingControls::OFF); + + timeProvider->Advance(TDuration::Seconds(60)); + for (size_t i = 0; i < 3; ++i) { + UNIT_ASSERT_EQUAL( + controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)).first, + TTracingControls::EXTERNAL); + } + UNIT_ASSERT_EQUAL( + controls.HandleTracing(true, RandomChoice(executeTransactionDiscriminators)).first, + TTracingControls::OFF); + } + + Y_UNIT_TEST(RequestTypeSampler) { + TTenantTestRuntime runtime(DefaultConsoleTestConfig()); + auto timeProvider = MakeIntrusive(TInstant::Now()); + auto [controls, configurator] = CreateSamplingThrottlingConfigurator(10, timeProvider); + NKikimrConfig::TTracingConfig cfg; + { + auto rule = cfg.AddSampling(); + rule->SetMaxTracesBurst(5); + rule->SetMaxTracesPerMinute(120); + rule->SetFraction(0.5); + rule->SetLevel(10); + rule->MutableScope()->AddRequestTypes()->assign("KeyValue.ExecuteTransaction"); + } + InitJaegerTracingConfigurator(runtime, std::move(configurator), cfg); + + for (size_t i = 0; i < 1000; ++i) { + auto [state, level] = controls.HandleTracing(false, {}); + UNIT_ASSERT_EQUAL(state, TTracingControls::OFF); + } + + for (size_t i = 0; i < 10; ++i) { + UNIT_ASSERT_EQUAL(controls.HandleTracing(false, {}).first, TTracingControls::OFF); // No request type + UNIT_ASSERT_EQUAL(controls.HandleTracing(false, {.RequestType = ERequestType::KEYVALUE_READ}).first, + TTracingControls::OFF); // Wrong request type + } + std::array executeTransactionDiscriminators{ + TRequestDiscriminator { + .RequestType = ERequestType::KEYVALUE_EXECUTETRANSACTION, + }, + TRequestDiscriminator { + .RequestType = ERequestType::KEYVALUE_EXECUTETRANSACTION, + .Database = "/Root/test", + } + }; + + { + uint64_t sampled = 0; + for (size_t i = 0; i < 1000; ++i) { + auto [state, level] = controls.HandleTracing(false, RandomChoice(executeTransactionDiscriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + ++sampled; + UNIT_ASSERT_EQUAL(level, 10); + timeProvider->Advance(TDuration::MilliSeconds(500)); + } + } + UNIT_ASSERT(sampled >= 400 && sampled <= 600); + } + + { + uint64_t sampled = 0; + for (size_t i = 0; i < 1000; ++i) { + auto [state, level] = controls.HandleTracing(false, RandomChoice(executeTransactionDiscriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + ++sampled; + UNIT_ASSERT_EQUAL(level, 10); + } + timeProvider->Advance(TDuration::MilliSeconds(125)); + } + UNIT_ASSERT(sampled >= 190 && sampled <= 260); + } + for (size_t i = 0; i < 50; ++i) { + controls.HandleTracing(false, RandomChoice(executeTransactionDiscriminators)); + } + for (size_t i = 0; i < 50; ++i) { + UNIT_ASSERT_EQUAL(controls.HandleTracing(false, RandomChoice(executeTransactionDiscriminators)).first, TTracingControls::OFF); + } + timeProvider->Advance(TDuration::Seconds(10)); + + WaitForUpdate(runtime); // Initial update + { + auto& rule = *cfg.MutableSampling(0); + rule.SetMaxTracesPerMinute(10); + rule.SetMaxTracesBurst(2); + rule.SetLevel(9); + rule.SetFraction(0.25); + rule.MutableScope()->MutableRequestTypes(0)->assign("KeyValue.ReadRange"); + } + ConfigureAndWaitUpdate(runtime, cfg, 1); + + std::array readRangeDiscriminators{ + TRequestDiscriminator{ + .RequestType = ERequestType::KEYVALUE_READRANGE, + }, + TRequestDiscriminator{ + .RequestType = ERequestType::KEYVALUE_READRANGE, + .Database = "/Root/test2", + } + }; + + for (size_t i = 0; i < 20; ++i) { + UNIT_ASSERT_EQUAL(controls.HandleTracing(false, RandomChoice(executeTransactionDiscriminators)).first, TTracingControls::OFF); + } + { + uint64_t sampled = 0; + for (size_t i = 0; i < 1000; ++i) { + auto [state, level] = controls.HandleTracing(false, RandomChoice(readRangeDiscriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + ++sampled; + UNIT_ASSERT_EQUAL(level, 9); + } + timeProvider->Advance(TDuration::Seconds(6)); + } + UNIT_ASSERT(sampled >= 190 && sampled <= 310); + } + } + + Y_UNIT_TEST(SamplingSameScope) { + TTenantTestRuntime runtime(DefaultConsoleTestConfig()); + auto timeProvider = MakeIntrusive(TInstant::Now()); + auto [controls, configurator] = CreateSamplingThrottlingConfigurator(10, timeProvider); + NKikimrConfig::TTracingConfig cfg; + { + auto rule = cfg.AddSampling(); + rule->SetMaxTracesBurst(10); + rule->SetMaxTracesPerMinute(120); + rule->SetFraction(0.5); + rule->SetLevel(8); + } + { + auto rule = cfg.AddSampling(); + rule->SetMaxTracesBurst(10); + rule->SetMaxTracesPerMinute(60); + rule->SetFraction(1. / 3); + rule->SetLevel(10); + } + InitJaegerTracingConfigurator(runtime, std::move(configurator), cfg); + + { + size_t level8 = 0; + size_t level10 = 0; + for (size_t i = 0; i < 1500; ++i) { + auto [state, level] = controls.HandleTracing(false, {}); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + UNIT_ASSERT(level == 8 || level == 10); + if (level == 8) { + ++level8; + } else { + ++level10; + } + } + timeProvider->Advance(TDuration::Seconds(1)); + } + UNIT_ASSERT(level8 >= 450 && level8 <= 570); + UNIT_ASSERT(level10 >= 450 && level10 <= 570); + } + timeProvider->Advance(TDuration::Minutes(1)); + + { + size_t level8 = 0; + size_t level10 = 0; + for (size_t i = 0; i < 1500; ++i) { + auto [state, level] = controls.HandleTracing(false, {}); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + UNIT_ASSERT(level == 8 || level == 10); + if (level == 8) { + ++level8; + } else { + ++level10; + } + } + timeProvider->Advance(TDuration::MilliSeconds(250)); + } + UNIT_ASSERT(level8 >= 470 && level8 <= 760); + UNIT_ASSERT(level10 >= 340 && level10 <= 385); + } + } + + Y_UNIT_TEST(ThrottlingByDb) { + TTenantTestRuntime runtime(DefaultConsoleTestConfig()); + auto timeProvider = MakeIntrusive(TInstant::Now()); + auto [controls, configurator] = CreateSamplingThrottlingConfigurator(10, timeProvider); + NKikimrConfig::TTracingConfig cfg; + { + auto rule = cfg.AddExternalThrottling(); + rule->SetMaxTracesBurst(10); + rule->SetMaxTracesPerMinute(60); + rule->MutableScope()->MutableDatabase()->assign("/Root/db1"); + } + InitJaegerTracingConfigurator(runtime, std::move(configurator), cfg); + + std::array discriminators{ + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_READROWS, + .Database = "/Root/db1", + }, + TRequestDiscriminator{ + .Database = "/Root/db1", + }, + }; + + { + size_t traced = 0; + for (size_t i = 0; i < 100; ++i) { + auto [state, _] = controls.HandleTracing(true, RandomChoice(discriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::SAMPLED); + if (state == TTracingControls::EXTERNAL) { + ++traced; + } + timeProvider->Advance(TDuration::Seconds(1)); + } + UNIT_ASSERT_EQUAL(traced, 100); + + for (size_t i = 0; i < 12; ++i) { + auto [state, _] = controls.HandleTracing(true, RandomChoice(discriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::SAMPLED); + if (state == TTracingControls::EXTERNAL) { + ++traced; + } + } + UNIT_ASSERT_EQUAL(traced, 111); + } + + cfg.MutableExternalThrottling(0)->MutableScope()->AddRequestTypes()->assign("Table.ReadRows"); + WaitForUpdate(runtime); // Initial update + ConfigureAndWaitUpdate(runtime, cfg, 1); + timeProvider->Advance(TDuration::Minutes(1)); + + { + size_t traced = 0; + for (size_t i = 0; i < 12; ++i) { + auto [state, _] = controls.HandleTracing(true, discriminators[0]); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::SAMPLED); + if (state == TTracingControls::EXTERNAL) { + ++traced; + } + } + UNIT_ASSERT_EQUAL(traced, 11); + timeProvider->Advance(TDuration::Minutes(1)); + + std::array notMatchingDiscriminators{ + discriminators[1], + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_DROPTABLE, + .Database = "/Root/db1", + }, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_READROWS, + .Database = "/Root/db2", + }, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_READROWS, + }, + TRequestDiscriminator{ + .Database = "/Root/db1", + }, + TRequestDiscriminator{}, + }; + + for (auto& discriminator : notMatchingDiscriminators) { + UNIT_ASSERT_EQUAL(controls.HandleTracing(true, discriminator).first, TTracingControls::OFF); + timeProvider->Advance(TDuration::Seconds(1)); + } + } + } + + Y_UNIT_TEST(SamplingByDb) { + TTenantTestRuntime runtime(DefaultConsoleTestConfig()); + auto timeProvider = MakeIntrusive(TInstant::Now()); + auto [controls, configurator] = CreateSamplingThrottlingConfigurator(10, timeProvider); + NKikimrConfig::TTracingConfig cfg; + { + auto rule = cfg.AddSampling(); + rule->SetMaxTracesBurst(10); + rule->SetMaxTracesPerMinute(60); + rule->SetLevel(0); + rule->SetFraction(0.5); + rule->MutableScope()->MutableDatabase()->assign("/Root/db1"); + } + InitJaegerTracingConfigurator(runtime, std::move(configurator), cfg); + + std::array discriminators{ + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_READROWS, + .Database = "/Root/db1", + }, + TRequestDiscriminator{ + .Database = "/Root/db1", + }, + }; + + { + size_t sampled = 0; + for (size_t i = 0; i < 1000; ++i) { + auto [state, level] = controls.HandleTracing(false, RandomChoice(discriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + UNIT_ASSERT_EQUAL(level, 0); + ++sampled; + } + timeProvider->Advance(TDuration::Seconds(1)); + } + UNIT_ASSERT(sampled >= 400 && sampled <= 600); + + } + { + size_t sampled = 0; + for (size_t i = 0; i < 60; ++i) { + auto [state, level] = controls.HandleTracing(false, RandomChoice(discriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + UNIT_ASSERT_EQUAL(level, 0); + ++sampled; + } + } + UNIT_ASSERT_EQUAL(sampled, 11); + } + + cfg.MutableSampling(0)->MutableScope()->AddRequestTypes()->assign("Table.ReadRows"); + WaitForUpdate(runtime); // Initial update + ConfigureAndWaitUpdate(runtime, cfg, 1); + timeProvider->Advance(TDuration::Minutes(1)); + + { + size_t sampled = 0; + for (size_t i = 0; i < 1000; ++i) { + auto [state, level] = controls.HandleTracing(false, discriminators[0]); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + UNIT_ASSERT_EQUAL(level, 0); + ++sampled; + } + timeProvider->Advance(TDuration::Seconds(1)); + } + UNIT_ASSERT(sampled >= 400 && sampled <= 600); + timeProvider->Advance(TDuration::Minutes(1)); + + std::array notMatchingDiscriminators{ + discriminators[1], + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_DROPTABLE, + .Database = "/Root/db1", + }, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_READROWS, + .Database = "/Root/db2", + }, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_READROWS, + }, + TRequestDiscriminator{ + .Database = "/Root/db1", + }, + TRequestDiscriminator{}, + }; + + for (size_t i = 0; i < 10; ++i) { + for (auto& discriminator : notMatchingDiscriminators) { + UNIT_ASSERT_EQUAL(controls.HandleTracing(false, discriminator).first, TTracingControls::OFF); + timeProvider->Advance(TDuration::Seconds(1)); + } + } + } + } + + Y_UNIT_TEST(SharedThrottlingLimits) { + TTenantTestRuntime runtime(DefaultConsoleTestConfig()); + auto timeProvider = MakeIntrusive(TInstant::Now()); + auto [controls, configurator] = CreateSamplingThrottlingConfigurator(10, timeProvider); + NKikimrConfig::TTracingConfig cfg; + { + auto rule = cfg.AddExternalThrottling(); + rule->SetMaxTracesBurst(10); + rule->SetMaxTracesPerMinute(60); + auto scope = rule->MutableScope(); + scope->AddRequestTypes("Table.DropTable"); + scope->AddRequestTypes("Table.ReadRows"); + scope->AddRequestTypes("Table.AlterTable"); + } + InitJaegerTracingConfigurator(runtime, std::move(configurator), cfg); + + std::array matchingDiscriminators{ + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_DROPTABLE, + }, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_ALTERTABLE, + .Database = "/Root/db1", + }, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_READROWS, + .Database = "/Root/db2", + }, + }; + + std::array notMatchingDiscriminators{ + TRequestDiscriminator{}, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_KEEPALIVE, + }, + }; + + for (size_t i = 0; i < 21; ++i) { + UNIT_ASSERT_EQUAL(controls.HandleTracing(false, RandomChoice(matchingDiscriminators)).first, TTracingControls::OFF); + UNIT_ASSERT_EQUAL(controls.HandleTracing(true, RandomChoice(matchingDiscriminators)).first, TTracingControls::EXTERNAL); + UNIT_ASSERT_EQUAL(controls.HandleTracing(true, RandomChoice(notMatchingDiscriminators)).first, TTracingControls::OFF); + timeProvider->Advance(TDuration::MilliSeconds(500)); + } + UNIT_ASSERT_EQUAL(controls.HandleTracing(true, RandomChoice(matchingDiscriminators)).first, TTracingControls::OFF); + } + + Y_UNIT_TEST(SharedSamplingLimits) { + TTenantTestRuntime runtime(DefaultConsoleTestConfig()); + auto timeProvider = MakeIntrusive(TInstant::Now()); + auto [controls, configurator] = CreateSamplingThrottlingConfigurator(10, timeProvider); + NKikimrConfig::TTracingConfig cfg; + { + auto rule = cfg.AddSampling(); + rule->SetMaxTracesBurst(10); + rule->SetMaxTracesPerMinute(60); + rule->SetLevel(8); + rule->SetFraction(0.5); + auto scope = rule->MutableScope(); + scope->AddRequestTypes("Table.DropTable"); + scope->AddRequestTypes("Table.ReadRows"); + scope->AddRequestTypes("Table.AlterTable"); + } + InitJaegerTracingConfigurator(runtime, std::move(configurator), cfg); + + std::array matchingDiscriminators{ + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_DROPTABLE, + }, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_ALTERTABLE, + .Database = "/Root/db1", + }, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_READROWS, + .Database = "/Root/db2", + }, + }; + + std::array notMatchingDiscriminators{ + TRequestDiscriminator{}, + TRequestDiscriminator{ + .RequestType = ERequestType::TABLE_KEEPALIVE, + }, + }; + + { + size_t sampled = 0; + for (size_t i = 0; i < 1000; ++i) { + auto [state, level] = controls.HandleTracing(false, RandomChoice(matchingDiscriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + UNIT_ASSERT_EQUAL(level, 8); + ++sampled; + } + UNIT_ASSERT_EQUAL(controls.HandleTracing(false, RandomChoice(notMatchingDiscriminators)).first, TTracingControls::OFF); + timeProvider->Advance(TDuration::Seconds(1)); + } + UNIT_ASSERT(sampled >= 400 && sampled <= 600); + } + timeProvider->Advance(TDuration::Minutes(1)); + + { + size_t sampled = 0; + for (size_t i = 0; i < 65; ++i) { + auto [state, level] = controls.HandleTracing(false, RandomChoice(matchingDiscriminators)); + UNIT_ASSERT_UNEQUAL(state, TTracingControls::EXTERNAL); + if (state == TTracingControls::SAMPLED) { + UNIT_ASSERT_EQUAL(level, 8); + ++sampled; + } + } + UNIT_ASSERT_EQUAL(sampled, 11); + } + } + +} +} // namespace NKikimr diff --git a/ydb/core/cms/console/ut/ya.make b/ydb/core/cms/console/ut/ya.make index 2daf90fac99a..88788f1b9324 100644 --- a/ydb/core/cms/console/ut/ya.make +++ b/ydb/core/cms/console/ut/ya.make @@ -25,6 +25,7 @@ SRCS( log_settings_configurator_ut.cpp modifications_validator_ut.cpp net_classifier_updater_ut.cpp + jaeger_tracing_configurator_ut.cpp ) END() diff --git a/ydb/core/cms/console/ya.make b/ydb/core/cms/console/ya.make index 1f786284e430..80a68a361dd9 100644 --- a/ydb/core/cms/console/ya.make +++ b/ydb/core/cms/console/ya.make @@ -56,6 +56,8 @@ SRCS( http.h immediate_controls_configurator.cpp immediate_controls_configurator.h + jaeger_tracing_configurator.cpp + jaeger_tracing_configurator.h log_settings_configurator.cpp log_settings_configurator.h logger.cpp diff --git a/ydb/core/cms/json_proxy_proto.h b/ydb/core/cms/json_proxy_proto.h index 918e7c638fcb..e5bceff34fba 100644 --- a/ydb/core/cms/json_proxy_proto.h +++ b/ydb/core/cms/json_proxy_proto.h @@ -76,16 +76,6 @@ class TJsonProxyProto : public TActorBootstrapped { return ReplyWithTypeDescription(*NKikimrConfig::TImmediateControlsConfig::TSchemeShardControls::descriptor(), ctx); else if (name == ".NKikimrConfig.TImmediateControlsConfig.TTCMallocControls") return ReplyWithTypeDescription(*NKikimrConfig::TImmediateControlsConfig::TTCMallocControls::descriptor(), ctx); - else if (name == ".NKikimrConfig.TImmediateControlsConfig.TTracingControls") - return ReplyWithTypeDescription(*NKikimrConfig::TImmediateControlsConfig::TTracingControls::descriptor(), ctx); - else if (name == ".NKikimrConfig.TImmediateControlsConfig.TTracingControls.TSamplingThrottlingOptions") - return ReplyWithTypeDescription(*NKikimrConfig::TImmediateControlsConfig::TTracingControls::TSamplingThrottlingOptions::descriptor(), ctx); - else if (name == ".NKikimrConfig.TImmediateControlsConfig.TTracingControls.TSamplingThrottlingOptions.TThrottlingOptions") - return ReplyWithTypeDescription(*NKikimrConfig::TImmediateControlsConfig::TTracingControls::TSamplingThrottlingOptions::TThrottlingOptions::descriptor(), ctx); - else if (name == ".NKikimrConfig.TImmediateControlsConfig.TTracingControls.TSamplingThrottlingOptions.TSamplingOptions") - return ReplyWithTypeDescription(*NKikimrConfig::TImmediateControlsConfig::TTracingControls::TSamplingThrottlingOptions::TSamplingOptions::descriptor(), ctx); - else if (name == ".NKikimrConfig.TImmediateControlsConfig.TTracingControls.TKeyValue") - return ReplyWithTypeDescription(*NKikimrConfig::TImmediateControlsConfig::TTracingControls::TKeyValue::descriptor(), ctx); else if (name == ".NKikimrConfig.TImmediateControlsConfig.TTabletControls") return ReplyWithTypeDescription(*NKikimrConfig::TImmediateControlsConfig::TTabletControls::descriptor(), ctx); } diff --git a/ydb/core/control/common_controls/tracing_control.cpp b/ydb/core/control/common_controls/tracing_control.cpp deleted file mode 100644 index 536738dd289f..000000000000 --- a/ydb/core/control/common_controls/tracing_control.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include "tracing_control.h" - -#include -#include -#include - -namespace NKikimr { - -namespace { - -const NKikimrConfig::TImmediateControlOptions& GetImmediateControlOptionsForField( - const google::protobuf::Descriptor& descriptor, TString fieldName) { - auto field = descriptor.FindFieldByName(fieldName); - Y_ABORT_UNLESS(field); - auto& fieldOptions = field->options(); - return fieldOptions.GetExtension(NKikimrConfig::ControlOptions); -} - -TThrottler CreateThrottler(TIntrusivePtr& icb, TIntrusivePtr timeProvider, TString domain) { - TControlWrapper maxRatePerMinute; - TControlWrapper maxBurst; - - const std::array, 2> controls = {{ - {maxRatePerMinute, "MaxRatePerMinute"}, - {maxBurst, "MaxBurst"}, - }}; - const auto& throttlingOptions = *NKikimrConfig::TImmediateControlsConfig::TTracingControls::TSamplingThrottlingOptions::TThrottlingOptions::descriptor(); - for (auto& [control, fieldName] : controls) { - const auto& controlOptions = GetImmediateControlOptionsForField(throttlingOptions, TString(fieldName)); - - control.Reset(controlOptions.GetDefaultValue(), controlOptions.GetMinValue(), controlOptions.GetMaxValue()); - icb->RegisterSharedControl(control, domain + "." + fieldName); - } - - return TThrottler(std::move(maxRatePerMinute), std::move(maxBurst), std::move(timeProvider)); -} - -} - -TTracingControl::TTracingControl(TIntrusivePtr& icb, TIntrusivePtr timeProvider, - TIntrusivePtr& randomProvider, TString controlDomain) -{ - SampledThrottler = CreateThrottler(icb, timeProvider, controlDomain + ".SampledThrottling"); - ExternalThrottler = CreateThrottler(icb, timeProvider, controlDomain + ".ExternalThrottling"); - - TControlWrapper samplingPPM; - const std::array, 2> controls = {{ - {samplingPPM, "PPM"}, - {SampledLevel, "Level"}, - }}; - - const auto& samplingOptions = *NKikimrConfig::TImmediateControlsConfig::TTracingControls::TSamplingThrottlingOptions::TSamplingOptions::descriptor(); - for (auto [control, name] : controls) { - const auto& controlOptions = GetImmediateControlOptionsForField(samplingOptions, TString(name)); - control.Reset(controlOptions.GetDefaultValue(), controlOptions.GetMinValue(), controlOptions.GetMaxValue()); - icb->RegisterSharedControl(control, controlDomain + ".Sampling." + name); - } - - Sampler = TSampler(std::move(samplingPPM), randomProvider->GenRand64()); -} - -} // namespace NKikimr diff --git a/ydb/core/control/common_controls/tracing_control.h b/ydb/core/control/common_controls/tracing_control.h deleted file mode 100644 index 56b7f45966da..000000000000 --- a/ydb/core/control/common_controls/tracing_control.h +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace NKikimr { - -class TTracingControl { -public: - TTracingControl(TIntrusivePtr& icb, TIntrusivePtr timeProvider, - TIntrusivePtr& randomProvider, TString controlDomain); - - bool SampleThrottle() { - return Sampler.Sample() && !SampledThrottler.Throttle(); - } - - bool ThrottleExternal() { - return ExternalThrottler.Throttle(); - } - - ui8 SampledVerbosity() const { - return SampledLevel; - } - -private: - TSampler Sampler; - TThrottler SampledThrottler; - TThrottler ExternalThrottler; - TControlWrapper SampledLevel; -}; - -} // namespace NKikimr diff --git a/ydb/core/control/common_controls/ya.make b/ydb/core/control/common_controls/ya.make deleted file mode 100644 index afc6df1f79d2..000000000000 --- a/ydb/core/control/common_controls/ya.make +++ /dev/null @@ -1,13 +0,0 @@ -LIBRARY() - -PEERDIR( - ydb/library/actors/wilson - ydb/core/protos -) - -SRCS( - tracing_control.h - tracing_control.cpp -) - -END() diff --git a/ydb/core/control/immediate_control_board_sampler.h b/ydb/core/control/immediate_control_board_sampler.h deleted file mode 100644 index e6d6784540b7..000000000000 --- a/ydb/core/control/immediate_control_board_sampler.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include - -namespace NKikimr { - -class TSampler { -public: - TSampler() : Rng(0) {} - - TSampler(TControlWrapper samplingPPM, ui64 seed) - : SamplingPPM(std::move(samplingPPM)) - , Rng(seed) - {} - - bool Sample() { - return Rng() % 1'000'000 < SamplingPPM; - } - -private: - TControlWrapper SamplingPPM; - TReallyFastRng32 Rng; -}; - -} // namespace NKikimr diff --git a/ydb/core/control/immediate_control_board_sampler_ut.cpp b/ydb/core/control/immediate_control_board_sampler_ut.cpp deleted file mode 100644 index 5df3cbe18bfd..000000000000 --- a/ydb/core/control/immediate_control_board_sampler_ut.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include "immediate_control_board_sampler.h" - -#include - -namespace NKikimr { - -Y_UNIT_TEST_SUITE(SamplingControlTests) { - ui32 RunTrials(TSampler& sampler, ui32 trials) { - ui32 cnt = 0; - for (ui32 i = 0; i < trials; ++i) { - if (sampler.Sample()) { - ++cnt; - } - } - return cnt; - } - - Y_UNIT_TEST(Simple) { - TControlWrapper control(500'000, 0, 1'000'000); - TSampler sampler(control, 42); - - auto samples = RunTrials(sampler, 100'000); - UNIT_ASSERT_GE(samples, 48'000); - UNIT_ASSERT_LE(samples, 52'000); - } - - Y_UNIT_TEST(EdgeCaseLower) { - TControlWrapper control(0, 0, 1'000'000); - TSampler sampler(control, 42); - - auto samples = RunTrials(sampler, 100'000); - UNIT_ASSERT_EQUAL(samples, 0); - } - - Y_UNIT_TEST(EdgeCaseUpper) { - TControlWrapper control(1'000'000, 0, 1'000'000); - TSampler sampler(control, 42); - - auto samples = RunTrials(sampler, 100'000); - UNIT_ASSERT_EQUAL(samples, 100'000); - } - - Y_UNIT_TEST(ChangingControl) { - TControlWrapper control(250'000, 0, 1'000'000); - TSampler sampler(control, 42); - - { - auto samples = RunTrials(sampler, 100'000); - UNIT_ASSERT_GE(samples, 23'000); - UNIT_ASSERT_LE(samples, 27'000); - } - - control = 750'000; - { - auto samples = RunTrials(sampler, 100'000); - UNIT_ASSERT_GE(samples, 73'000); - UNIT_ASSERT_LE(samples, 77'000); - } - } -} - -} // namespace NKikimr diff --git a/ydb/core/control/immediate_control_board_throttler.h b/ydb/core/control/immediate_control_board_throttler.h deleted file mode 100644 index c3ee83bf0f43..000000000000 --- a/ydb/core/control/immediate_control_board_throttler.h +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once - -#include -#include - -namespace NKikimr { - -class TThrottler { -public: - TThrottler() = default; - - TThrottler(TControlWrapper maxRatePerMinute, TControlWrapper maxBurst, - TIntrusivePtr timeProvider) - : TimeProvider(std::move(timeProvider)) - , MaxRatePerMinute(std::move(maxRatePerMinute)) - , MaxBurst(std::move(maxBurst)) - , LastUpdate(TimeProvider->Now()) - {} - - bool Throttle() { - auto maxRatePerMinute = static_cast(MaxRatePerMinute); - auto maxBurst = static_cast(MaxBurst); - auto maxTotal = maxBurst + 1; - CurrentBurst = std::min(CurrentBurst, maxTotal); - if (maxRatePerMinute == 0) { - return true; - } - - auto now = TimeProvider->Now(); - if (now < LastUpdate) { - return true; - } - - const auto deltaBetweenSends = TDuration::Minutes(1) / maxRatePerMinute; - UpdateStats(now, deltaBetweenSends); - - if (CurrentBurst < maxTotal) { - CurrentBurst += 1; - return false; - } - - return true; - } - -private: - void UpdateStats(TInstant now, TDuration deltaBetweenSends) { - i64 decrease = (now - LastUpdate) / deltaBetweenSends; - decrease = std::min(decrease, CurrentBurst); - Y_ABORT_UNLESS(decrease >= 0); - CurrentBurst -= decrease; - LastUpdate += decrease * deltaBetweenSends; - if (CurrentBurst == 0) { - LastUpdate = now; - } - } - - TIntrusivePtr TimeProvider; - - TControlWrapper MaxRatePerMinute; - TControlWrapper MaxBurst; - - TInstant LastUpdate = TInstant::Zero(); - i64 CurrentBurst = 0; -}; - -} // namespace NKikimr diff --git a/ydb/core/control/immediate_control_board_throttler_ut.cpp b/ydb/core/control/immediate_control_board_throttler_ut.cpp deleted file mode 100644 index 3a5e40edf532..000000000000 --- a/ydb/core/control/immediate_control_board_throttler_ut.cpp +++ /dev/null @@ -1,126 +0,0 @@ -#include "immediate_control_board_throttler.h" - -#include - -namespace NKikimr { - -class TTimeProviderMock : public ITimeProvider { -public: - TTimeProviderMock(TInstant now) : CurrentTime(now) {} - - void Advance(TDuration delta) { - CurrentTime += delta; - } - - TInstant Now() final { - return CurrentTime; - } - -private: - TInstant CurrentTime; -}; - -Y_UNIT_TEST_SUITE(ThrottlerControlTests) { - void CheckAtLeast(TThrottler& throttler, ui32 n) { - for (ui32 i = 0; i < n; ++i) { - UNIT_ASSERT(!throttler.Throttle()); - } - } - - void CheckExact(TThrottler& throttler, ui32 n) { - CheckAtLeast(throttler, n); - UNIT_ASSERT(throttler.Throttle()); - } - - Y_UNIT_TEST(Simple) { - TControlWrapper maxPerMinute(6, 0, 180); - TControlWrapper maxBurst(2, 0, 180); - - auto timeProvider = MakeIntrusive(TInstant::Now()); - - TThrottler throttler(maxPerMinute, maxBurst, timeProvider); - CheckExact(throttler, 3); - CheckExact(throttler, 0); - - timeProvider->Advance(TDuration::Seconds(9)); - CheckExact(throttler, 0); - timeProvider->Advance(TDuration::Seconds(1)); - CheckExact(throttler, 1); - - timeProvider->Advance(TDuration::Seconds(15)); - CheckExact(throttler, 1); - - timeProvider->Advance(TDuration::Seconds(15)); - CheckExact(throttler, 2); - } - - Y_UNIT_TEST(LongIdle) { - TControlWrapper maxPerMinute(10, 0, 180); - TControlWrapper maxBurst(2, 0, 180); - - auto timeProvider = MakeIntrusive(TInstant::Now()); - - TThrottler throttler(maxPerMinute, maxBurst, timeProvider); - CheckAtLeast(throttler, 3); - - timeProvider->Advance(TDuration::Hours(1)); - CheckExact(throttler, 3); - } - - Y_UNIT_TEST(Overflow) { - TControlWrapper maxPerMinute(6'000, 0, 6'000); - TControlWrapper maxBurst(6'000, 0, 6'000); - - auto timeProvider = MakeIntrusive(TInstant::Now()); - - TThrottler throttler(maxPerMinute, maxBurst, timeProvider); - CheckExact(throttler, 6'001); - - timeProvider->Advance(TDuration::Days(365 * 10)); - - CheckExact(throttler, 6'001); - } - - Y_UNIT_TEST(ChangingControls) { - TControlWrapper maxPerMinute(6, 0, 180); - TControlWrapper maxBurst(2, 0, 180); - - auto timeProvider = MakeIntrusive(TInstant::Now()); - - TThrottler throttler(maxPerMinute, maxBurst, timeProvider); - CheckExact(throttler, 3); - - maxBurst = 4; - CheckExact(throttler, 2); - - maxBurst = 0; - CheckExact(throttler, 0); - - timeProvider->Advance(TDuration::Seconds(9)); - CheckExact(throttler, 0); - timeProvider->Advance(TDuration::Seconds(1)); - CheckExact(throttler, 1); - - maxPerMinute = 12 * 60; - timeProvider->Advance(TDuration::Seconds(1)); - CheckExact(throttler, 1); - - maxBurst = 20; - - timeProvider->Advance(TDuration::Seconds(3)); - CheckExact(throttler, 21); - - maxBurst = 0; - timeProvider->Advance(TDuration::Seconds(59)); - CheckAtLeast(throttler, 1); - maxPerMinute = 1; - CheckExact(throttler, 0); - timeProvider->Advance(TDuration::Minutes(1)); - CheckExact(throttler, 1); - - maxBurst = 2; - CheckExact(throttler, 2); - } -} - -} // namespace NKikimr diff --git a/ydb/core/control/ut/ya.make b/ydb/core/control/ut/ya.make index ede54f7d8c00..1e4885a42485 100644 --- a/ydb/core/control/ut/ya.make +++ b/ydb/core/control/ut/ya.make @@ -22,8 +22,6 @@ PEERDIR( SRCS( immediate_control_board_ut.cpp immediate_control_board_actor_ut.cpp - immediate_control_board_sampler_ut.cpp - immediate_control_board_throttler_ut.cpp ) END() diff --git a/ydb/core/control/ya.make b/ydb/core/control/ya.make index 4faca2369cf5..8c1c83bff961 100644 --- a/ydb/core/control/ya.make +++ b/ydb/core/control/ya.make @@ -19,16 +19,10 @@ SRCS( immediate_control_board_impl.cpp immediate_control_board_impl.h immediate_control_board_wrapper.h - immediate_control_board_throttler.h - immediate_control_board_sampler.h ) END() -RECURSE( - common_controls -) - RECURSE_FOR_TESTS( ut ) diff --git a/ydb/core/driver_lib/run/factories.h b/ydb/core/driver_lib/run/factories.h index e309fda49a0f..f22be07ab484 100644 --- a/ydb/core/driver_lib/run/factories.h +++ b/ydb/core/driver_lib/run/factories.h @@ -56,7 +56,7 @@ struct TModuleFactories { std::shared_ptr DataStreamsAuthFactory; std::vector AdditionalComputationNodeFactories; - std::unique_ptr(*WilsonGrpcSignerFactory)(const NKikimrConfig::TTracingConfig::TAuthConfig&); + std::unique_ptr(*WilsonGrpcSignerFactory)(const NKikimrConfig::TTracingConfig::TBackendConfig::TAuthConfig&); ~TModuleFactories(); }; diff --git a/ydb/core/driver_lib/run/kikimr_services_initializers.cpp b/ydb/core/driver_lib/run/kikimr_services_initializers.cpp index 0dae3013e17b..4b10b0f64dc7 100644 --- a/ydb/core/driver_lib/run/kikimr_services_initializers.cpp +++ b/ydb/core/driver_lib/run/kikimr_services_initializers.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -826,22 +827,69 @@ void TBasicServicesInitializer::InitializeServices(NActors::TActorSystemSetup* s } } - if (Config.HasTracingConfig()) { - const auto& tracing = Config.GetTracingConfig(); + if (Config.HasTracingConfig() && Config.GetTracingConfig().HasBackend()) { + const auto& tracingConfig = Config.GetTracingConfig(); + const auto& tracingBackend = tracingConfig.GetBackend(); + std::unique_ptr grpcSigner; - if (tracing.HasAuthConfig() && Factories && Factories->WilsonGrpcSignerFactory) { - grpcSigner = Factories->WilsonGrpcSignerFactory(tracing.GetAuthConfig()); + if (tracingBackend.HasAuthConfig() && Factories && Factories->WilsonGrpcSignerFactory) { + grpcSigner = Factories->WilsonGrpcSignerFactory(tracingBackend.GetAuthConfig()); + if (!grpcSigner) { + Cerr << "Failed to initialize wilson grpc signer due to misconfiguration. Config provided: " + << tracingBackend.GetAuthConfig().DebugString() << Endl; + } + } + + std::unique_ptr wilsonUploader; + switch (tracingBackend.GetBackendCase()) { + case NKikimrConfig::TTracingConfig::TBackendConfig::BackendCase::kOpentelemetry: { + const auto& opentelemetry = tracingBackend.GetOpentelemetry(); + if (!(opentelemetry.HasCollectorUrl() && opentelemetry.HasServiceName())) { + Cerr << "Both collector_url and service_name should be present in opentelemetry backend config" << Endl; + break; + } + + NWilson::TWilsonUploaderParams uploaderParams { + .CollectorUrl = opentelemetry.GetCollectorUrl(), + .ServiceName = opentelemetry.GetServiceName(), + .GrpcSigner = std::move(grpcSigner), + }; + + if (tracingConfig.HasUploader()) { + const auto& uploaderConfig = tracingConfig.GetUploader(); + +#ifdef GET_FIELD_FROM_CONFIG +#error Macro collision +#endif +#define GET_FIELD_FROM_CONFIG(field) \ + if (uploaderConfig.Has##field()) { \ + uploaderParams.field = uploaderConfig.Get##field(); \ + } + + GET_FIELD_FROM_CONFIG(MaxExportedSpansPerSecond) + GET_FIELD_FROM_CONFIG(MaxSpansInBatch) + GET_FIELD_FROM_CONFIG(MaxBytesInBatch) + GET_FIELD_FROM_CONFIG(MaxBatchAccumulationMilliseconds) + GET_FIELD_FROM_CONFIG(SpanExportTimeoutSeconds) + GET_FIELD_FROM_CONFIG(MaxExportRequestsInflight) + +#undef GET_FIELD_FROM_CONFIG + } + + wilsonUploader.reset(std::move(uploaderParams).CreateUploader()); + break; + } + + case NKikimrConfig::TTracingConfig::TBackendConfig::BackendCase::BACKEND_NOT_SET: { + Cerr << "No backend option was provided in tracing config" << Endl; + break; + } + } + if (wilsonUploader) { + setup->LocalServices.emplace_back( + NWilson::MakeWilsonUploaderId(), + TActorSetupCmd(wilsonUploader.release(), TMailboxType::ReadAsFilled, appData->BatchPoolId)); } - auto wilsonUploader = NWilson::WilsonUploaderParams { - .Host = tracing.GetHost(), - .Port = static_cast(tracing.GetPort()), - .RootCA = tracing.GetRootCA(), - .ServiceName = tracing.GetServiceName(), - .GrpcSigner = std::move(grpcSigner), - }.CreateUploader(); - setup->LocalServices.emplace_back( - NWilson::MakeWilsonUploaderId(), - TActorSetupCmd(wilsonUploader, TMailboxType::ReadAsFilled, appData->BatchPoolId)); } } @@ -1616,15 +1664,22 @@ void TGRpcServicesInitializer::InitializeServices(NActors::TActorSystemSetup* se if (!IsServiceInitialized(setup, NGRpcService::CreateGRpcRequestProxyId(0))) { const size_t proxyCount = Config.HasGRpcConfig() ? Config.GetGRpcConfig().GetGRpcProxyCount() : 1UL; + NJaegerTracing::TSamplingThrottlingConfigurator tracingConfigurator(appData->TimeProvider, appData->RandomProvider); for (size_t i = 0; i < proxyCount; ++i) { auto grpcReqProxy = Config.HasGRpcConfig() && Config.GetGRpcConfig().GetSkipSchemeCheck() ? NGRpcService::CreateGRpcRequestProxySimple(Config) - : NGRpcService::CreateGRpcRequestProxy(Config, appData->Icb); + : NGRpcService::CreateGRpcRequestProxy(Config, tracingConfigurator.GetControl()); setup->LocalServices.push_back(std::pair(NGRpcService::CreateGRpcRequestProxyId(i), TActorSetupCmd(grpcReqProxy, TMailboxType::ReadAsFilled, appData->UserPoolId))); } + setup->LocalServices.push_back(std::pair( + TActorId(), + TActorSetupCmd( + NConsole::CreateJaegerTracingConfigurator(std::move(tracingConfigurator), Config.GetTracingConfig()), + TMailboxType::ReadAsFilled, + appData->UserPoolId))); } if (!IsServiceInitialized(setup, NKesus::MakeKesusProxyServiceId())) { diff --git a/ydb/core/driver_lib/run/ya.make b/ydb/core/driver_lib/run/ya.make index 63b1a7f4cc5c..62714d224329 100644 --- a/ydb/core/driver_lib/run/ya.make +++ b/ydb/core/driver_lib/run/ya.make @@ -74,6 +74,7 @@ PEERDIR( ydb/core/grpc_services/auth_processor ydb/core/health_check ydb/core/http_proxy + ydb/core/jaeger_tracing ydb/core/kesus/proxy ydb/core/kesus/tablet ydb/core/keyvalue diff --git a/ydb/core/grpc_services/base/base.h b/ydb/core/grpc_services/base/base.h index ccf75a8c5091..e70c07eb1c38 100644 --- a/ydb/core/grpc_services/base/base.h +++ b/ydb/core/grpc_services/base/base.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -347,6 +348,7 @@ struct TRequestAuxSettings { TRateLimiterMode RlMode = TRateLimiterMode::Off; void (*CustomAttributeProcessor)(const TSchemeBoardEvents::TDescribeSchemeResult& schemeData, ICheckerIface*) = nullptr; TAuditMode AuditMode = TAuditMode::Off; + NJaegerTracing::ERequestType RequestType = NJaegerTracing::ERequestType::UNSPECIFIED; }; // grpc_request_proxy part @@ -363,12 +365,17 @@ class IRequestProxyCtx : public virtual IRequestCtxBase { virtual void ReplyUnauthenticated(const TString& msg = "") = 0; virtual void ReplyUnavaliable() = 0; - //tracing + // tracing virtual void StartTracing(NWilson::TSpan&& span) = 0; - virtual void LegacyFinishSpan() = 0; + virtual void FinishSpan() = 0; + // Returns pointer to a state that denotes whether this request ever been a subject + // to tracing decision. CAN be nullptr + virtual bool* IsTracingDecided() = 0; // Used for per-type sampling - virtual const TString& GetInternalRequestType() const = 0; + virtual NJaegerTracing::TRequestDiscriminator GetRequestDiscriminator() const { + return NJaegerTracing::TRequestDiscriminator::EMPTY; + }; // validation virtual bool Validate(TString& error) = 0; @@ -487,10 +494,9 @@ class TRefreshTokenImpl } void StartTracing(NWilson::TSpan&& /*span*/) override {} - void LegacyFinishSpan() override {} - const TString& GetInternalRequestType() const final { - static const TString empty = ""; - return empty; + void FinishSpan() override {} + bool* IsTracingDecided() override { + return nullptr; } void UpdateAuthState(NYdbGrpc::TAuthState::EAuthState state) override { @@ -893,12 +899,12 @@ class TGRpcRequestBiStreamWrapper Span_ = std::move(span); } - void LegacyFinishSpan() override { + void FinishSpan() override { Span_.End(); } - const TString& GetInternalRequestType() const final { - return TRequest::descriptor()->full_name(); + bool* IsTracingDecided() override { + return &IsTracingDecided_; } // IRequestCtxBase @@ -919,6 +925,7 @@ class TGRpcRequestBiStreamWrapper bool RlAllowed_; IGRpcProxyCounters::TPtr Counters_; NWilson::TSpan Span_; + bool IsTracingDecided_ = false; }; template @@ -1311,10 +1318,12 @@ class TGRpcRequestWrapperImpl Span_ = std::move(span); } - void LegacyFinishSpan() override {} + void FinishSpan() override { + Span_.End(); + } - const TString& GetInternalRequestType() const final { - return TRequest::descriptor()->full_name(); + bool* IsTracingDecided() override { + return &IsTracingDecided_; } void ReplyGrpcError(grpc::StatusCode code, const TString& msg, const TString& details = "") { @@ -1374,6 +1383,7 @@ class TGRpcRequestWrapperImpl TAuditLogParts AuditLogParts; TAuditLogHook AuditLogHook; bool RequestFinished = false; + bool IsTracingDecided_ = false; }; template @@ -1418,7 +1428,7 @@ class TGrpcRequestCall using TRequestIface = typename std::conditional::type; public: - static IActor* CreateRpcActor(typename std::conditional::type* msg); + static IActor* CreateRpcActor(TRequestIface* msg); static constexpr bool IsOp = IsOperation; using TBase = std::conditional_t::Value, @@ -1435,8 +1445,6 @@ class TGrpcRequestCall { } void Pass(const IFacilityProvider& facility) override { - this->Span_.End(); - try { PassMethod(std::move(std::unique_ptr(this)), facility); } catch (const std::exception& ex) { @@ -1460,6 +1468,13 @@ class TGrpcRequestCall } } + NJaegerTracing::TRequestDiscriminator GetRequestDiscriminator() const override { + return { + .RequestType = AuxSettings.RequestType, + .Database = TBase::GetDatabaseName(), + }; + } + // IRequestCtxBaseMtSafe // bool IsAuditable() const override { diff --git a/ydb/core/grpc_services/base/ya.make b/ydb/core/grpc_services/base/ya.make index 5151c40f90ba..1e94d4c16c71 100644 --- a/ydb/core/grpc_services/base/ya.make +++ b/ydb/core/grpc_services/base/ya.make @@ -11,6 +11,7 @@ PEERDIR( ydb/core/base ydb/core/grpc_services/counters ydb/core/grpc_streaming + ydb/core/jaeger_tracing ydb/public/api/protos ydb/public/sdk/cpp/client/resources ydb/library/yql/public/issue diff --git a/ydb/core/grpc_services/grpc_request_check_actor.h b/ydb/core/grpc_services/grpc_request_check_actor.h index a3e9ff044168..2e007749a916 100644 --- a/ydb/core/grpc_services/grpc_request_check_actor.h +++ b/ydb/core/grpc_services/grpc_request_check_actor.h @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -89,10 +90,11 @@ class TGrpcRequestCheckActor , Request_(std::move(request)) , Counters_(counters) , SecurityObject_(std::move(securityObject)) + , GrpcRequestBaseCtx_(Request_->Get()) , SkipCheckConnectRigths_(skipCheckConnectRigths) , FacilityProvider_(facilityProvider) + , Span_(TWilsonGrpc::RequestCheckActor, GrpcRequestBaseCtx_->GetWilsonTraceId(), "RequestCheckActor") { - GrpcRequestBaseCtx_ = Request_->Get(); TMaybe authToken = GrpcRequestBaseCtx_->GetYdbToken(); if (authToken) { TString peerName = GrpcRequestBaseCtx_->GetPeerName(); @@ -225,7 +227,8 @@ class TGrpcRequestCheckActor } void HandlePoison(TEvents::TEvPoisonPill::TPtr&) { - TBase::PassAway(); + GrpcRequestBaseCtx_->FinishSpan(); + PassAway(); } ui64 GetChannelBufferSize() const override { @@ -238,6 +241,11 @@ class TGrpcRequestCheckActor return this->RegisterWithSameMailbox(actor); } + void PassAway() override { + Span_.EndOk(); + TBase::PassAway(); + } + private: static NYql::TIssues GetRlIssues(const Ydb::RateLimiter::AcquireResourceResponse& resp) { NYql::TIssues opIssues; @@ -374,35 +382,40 @@ class TGrpcRequestCheckActor void ReplyUnauthorizedAndDie(const NYql::TIssue& issue) { GrpcRequestBaseCtx_->RaiseIssue(issue); GrpcRequestBaseCtx_->ReplyWithYdbStatus(Ydb::StatusIds::UNAUTHORIZED); - TBase::PassAway(); + GrpcRequestBaseCtx_->FinishSpan(); + PassAway(); } void ReplyUnavailableAndDie(const NYql::TIssue& issue) { GrpcRequestBaseCtx_->RaiseIssue(issue); GrpcRequestBaseCtx_->ReplyWithYdbStatus(Ydb::StatusIds::UNAVAILABLE); - TBase::PassAway(); + GrpcRequestBaseCtx_->FinishSpan(); + PassAway(); } void ReplyUnavailableAndDie(const NYql::TIssues& issue) { GrpcRequestBaseCtx_->RaiseIssues(issue); GrpcRequestBaseCtx_->ReplyWithYdbStatus(Ydb::StatusIds::UNAVAILABLE); - TBase::PassAway(); + GrpcRequestBaseCtx_->FinishSpan(); + PassAway(); } void ReplyUnauthenticatedAndDie() { GrpcRequestBaseCtx_->ReplyUnauthenticated("Unknown database"); - TBase::PassAway(); + GrpcRequestBaseCtx_->FinishSpan(); + PassAway(); } void ReplyOverloadedAndDie(const NYql::TIssue& issue) { GrpcRequestBaseCtx_->RaiseIssue(issue); GrpcRequestBaseCtx_->ReplyWithYdbStatus(Ydb::StatusIds::OVERLOADED); - TBase::PassAway(); + GrpcRequestBaseCtx_->FinishSpan(); + PassAway(); } void Continue() { if (!ValidateAndReplyOnError(GrpcRequestBaseCtx_)) { - TBase::PassAway(); + PassAway(); return; } HandleAndDie(Request_); @@ -413,8 +426,9 @@ class TGrpcRequestCheckActor // and authorization check against the database AuditRequest(GrpcRequestBaseCtx_, CheckedDatabaseName_, TBase::GetUserSID()); + GrpcRequestBaseCtx_->FinishSpan(); event->Release().Release()->Pass(*this); - TBase::PassAway(); + PassAway(); } void HandleAndDie(TAutoPtr>&) { @@ -428,14 +442,14 @@ class TGrpcRequestCheckActor template void HandleAndDie(T& event) { - GrpcRequestBaseCtx_->LegacyFinishSpan(); + GrpcRequestBaseCtx_->FinishSpan(); TGRpcRequestProxyHandleMethods::Handle(event, TlsActivationContext->AsActorContext()); - TBase::PassAway(); + PassAway(); } void ReplyBackAndDie() { TlsActivationContext->Send(Request_->Forward(Owner_)); - TBase::PassAway(); + PassAway(); } std::pair> CheckConnectRight() { @@ -512,6 +526,7 @@ class TGrpcRequestCheckActor const IFacilityProvider* FacilityProvider_; bool DmlAuditEnabled_ = false; std::unordered_set DmlAuditExpectedSubjects_; + NWilson::TSpan Span_; }; // default behavior - attributes in schema diff --git a/ydb/core/grpc_services/grpc_request_proxy.cpp b/ydb/core/grpc_services/grpc_request_proxy.cpp index ca2d67d49818..e9b9116c888e 100644 --- a/ydb/core/grpc_services/grpc_request_proxy.cpp +++ b/ydb/core/grpc_services/grpc_request_proxy.cpp @@ -8,14 +8,12 @@ #include #include #include -#include #include +#include #include #include #include -#include - namespace NKikimr { namespace NGRpcService { @@ -61,9 +59,9 @@ class TGRpcRequestProxyImpl { using TBase = TActorBootstrapped; public: - explicit TGRpcRequestProxyImpl(const NKikimrConfig::TAppConfig& appConfig, TIntrusivePtr icb) + explicit TGRpcRequestProxyImpl(const NKikimrConfig::TAppConfig& appConfig, TIntrusivePtr tracingControl) : ChannelBufferSize(appConfig.GetTableServiceConfig().GetResourceManager().GetChannelBufferSize()) - , Icb(std::move(icb)) + , TracingControl(std::move(tracingControl)) { } void Bootstrap(const TActorContext& ctx); @@ -82,8 +80,6 @@ class TGRpcRequestProxyImpl void HandleSchemeBoard(TSchemeBoardEvents::TEvNotifyDelete::TPtr& ev); void ReplayEvents(const TString& databaseName, const TActorContext& ctx); - static TString InternalRequestTypeToControlDomain(const TString& type); - TTracingControl& GetTracingControl(const TString& type); void MaybeStartTracing(IRequestProxyCtx& ctx); static bool IsAuthStateOK(const IRequestProxyCtx& ctx); @@ -92,7 +88,7 @@ class TGRpcRequestProxyImpl void Handle(TAutoPtr>& event, const TActorContext& ctx) { IRequestProxyCtx* requestBaseCtx = event->Get(); if (ValidateAndReplyOnError(requestBaseCtx)) { - requestBaseCtx->LegacyFinishSpan(); + requestBaseCtx->FinishSpan(); TGRpcRequestProxyHandleMethods::Handle(event, ctx); } } @@ -100,7 +96,7 @@ class TGRpcRequestProxyImpl void Handle(TEvListEndpointsRequest::TPtr& event, const TActorContext& ctx) { IRequestProxyCtx* requestBaseCtx = event->Get(); if (ValidateAndReplyOnError(requestBaseCtx)) { - requestBaseCtx->LegacyFinishSpan(); + requestBaseCtx->FinishSpan(); TGRpcRequestProxy::Handle(event, ctx); } } @@ -108,6 +104,7 @@ class TGRpcRequestProxyImpl void Handle(TEvProxyRuntimeEvent::TPtr& event, const TActorContext&) { IRequestProxyCtx* requestBaseCtx = event->Get(); if (ValidateAndReplyOnError(requestBaseCtx)) { + requestBaseCtx->FinishSpan(); event->Release().Release()->Pass(*this); } } @@ -139,22 +136,21 @@ class TGRpcRequestProxyImpl return true; } - template + template void PreHandle(TAutoPtr>& event, const TActorContext& ctx) { - IRequestProxyCtx* requestBaseCtx = event->Get(); - LogRequest(event); + IRequestProxyCtx* requestBaseCtx = event->Get(); if (!SchemeCache) { const TString error = "Grpc proxy is not ready to accept request, no proxy service"; LOG_ERROR_S(ctx, NKikimrServices::GRPC_SERVER, error); const auto issue = MakeIssue(NKikimrIssues::TIssuesIds::GENERIC_TXPROXY_ERROR, error); requestBaseCtx->RaiseIssue(issue); requestBaseCtx->ReplyWithYdbStatus(Ydb::StatusIds::UNAVAILABLE); + requestBaseCtx->FinishSpan(); return; } - MaybeStartTracing(*requestBaseCtx); if (IsAuthStateOK(*requestBaseCtx)) { @@ -166,6 +162,7 @@ class TGRpcRequestProxyImpl if (state.State == NYdbGrpc::TAuthState::AS_FAIL) { requestBaseCtx->ReplyUnauthenticated(); + requestBaseCtx->FinishSpan(); return; } @@ -175,6 +172,7 @@ class TGRpcRequestProxyImpl const auto issue = MakeIssue(NKikimrIssues::TIssuesIds::YDB_AUTH_UNAVAILABLE, error); requestBaseCtx->RaiseIssue(issue); requestBaseCtx->ReplyUnavaliable(); + requestBaseCtx->FinishSpan(); return; } @@ -192,6 +190,7 @@ class TGRpcRequestProxyImpl } else { if (!AllowYdbRequestsWithoutDatabase && DynamicNode) { requestBaseCtx->ReplyUnauthenticated("Requests without specified database is not allowed"); + requestBaseCtx->FinishSpan(); return; } else { databaseName = RootDatabase; @@ -202,6 +201,7 @@ class TGRpcRequestProxyImpl if (databaseName.empty()) { Counters->IncDatabaseUnavailableCounter(); requestBaseCtx->ReplyUnauthenticated("Empty database name"); + requestBaseCtx->FinishSpan(); return; } auto it = Databases.find(databaseName); @@ -216,6 +216,8 @@ class TGRpcRequestProxyImpl const auto issue = MakeIssue(NKikimrIssues::TIssuesIds::YDB_DB_NOT_READY, error); requestBaseCtx->RaiseIssue(issue); requestBaseCtx->ReplyUnavaliable(); + requestBaseCtx->FinishSpan(); + return; } return; } @@ -236,6 +238,7 @@ class TGRpcRequestProxyImpl auto issue = MakeIssue(NKikimrIssues::TIssuesIds::ACCESS_DENIED, error); requestBaseCtx->RaiseIssue(issue); requestBaseCtx->ReplyWithYdbStatus(Ydb::StatusIds::UNAUTHORIZED); + requestBaseCtx->FinishSpan(); return; } } @@ -247,6 +250,7 @@ class TGRpcRequestProxyImpl auto issue = MakeIssue(NKikimrIssues::TIssuesIds::YDB_DB_NOT_READY, "database unavailable"); requestBaseCtx->RaiseIssue(issue); requestBaseCtx->ReplyWithYdbStatus(Ydb::StatusIds::UNAVAILABLE); + requestBaseCtx->FinishSpan(); return; } @@ -255,6 +259,7 @@ class TGRpcRequestProxyImpl LOG_DEBUG(*TlsActivationContext, NKikimrServices::GRPC_SERVER, "Client was disconnected before processing request (grpc request proxy)"); requestBaseCtx->ReplyWithYdbStatus(Ydb::StatusIds::UNAVAILABLE); + requestBaseCtx->FinishSpan(); return; } @@ -272,6 +277,8 @@ class TGRpcRequestProxyImpl const auto issue = MakeIssue(NKikimrIssues::TIssuesIds::GENERIC_TXPROXY_ERROR, "Can't authenticate request"); requestBaseCtx->RaiseIssue(issue); requestBaseCtx->ReplyWithYdbStatus(Ydb::StatusIds::BAD_REQUEST); + requestBaseCtx->FinishSpan(); + return; } void ForgetDatabase(const TString& database); @@ -291,6 +298,7 @@ class TGRpcRequestProxyImpl for (auto& [_, queue] : DeferredEvents) { for (TEventReqHolder& req : queue) { req.Ctx->ReplyUnavaliable(); + req.Ctx->FinishSpan(); } } @@ -315,8 +323,7 @@ class TGRpcRequestProxyImpl bool DynamicNode = false; TString RootDatabase; IGRpcProxyCounters::TPtr Counters; - THashMap TracingControls; - TIntrusivePtr Icb; + TIntrusivePtr TracingControl; }; void TGRpcRequestProxyImpl::Bootstrap(const TActorContext& ctx) { @@ -415,51 +422,26 @@ bool TGRpcRequestProxyImpl::IsAuthStateOK(const IRequestProxyCtx& ctx) { state.NeedAuth == false && !ctx.GetYdbToken(); } -TString TGRpcRequestProxyImpl::InternalRequestTypeToControlDomain(const TString& type) { - static constexpr TStringBuf ydbNamespacePrefix = "Ydb."; - static constexpr TStringBuf requestSuffix = "Request"; - - TString controlDomain = type; - if (controlDomain.StartsWith(ydbNamespacePrefix)) { - controlDomain.erase(0, ydbNamespacePrefix.size()); - } - if (controlDomain.EndsWith(requestSuffix)) { - controlDomain.erase(controlDomain.size() - requestSuffix.size()); - } - - return controlDomain; -} - -TTracingControl& TGRpcRequestProxyImpl::GetTracingControl(const TString& type) { - if (auto it = TracingControls.find(type); it != TracingControls.end()) { - return it->second; - } - auto tracingControlsDomain = InternalRequestTypeToControlDomain(type); - auto domain = TString::Join("TracingControls.", tracingControlsDomain); - TTracingControl control(Icb, TAppData::TimeProvider, TAppData::RandomProvider, std::move(domain)); - return TracingControls.emplace(type, std::move(control)).first->second; -} - void TGRpcRequestProxyImpl::MaybeStartTracing(IRequestProxyCtx& ctx) { - auto requestType = ctx.GetInternalRequestType(); - if (requestType.empty()) { + auto isTracingDecided = ctx.IsTracingDecided(); + if (!isTracingDecided) { return; } + if (std::exchange(*isTracingDecided, true)) { + return; + } + NWilson::TTraceId traceId; if (const auto otelHeader = ctx.GetPeerMetaValues(NYdb::OTEL_TRACE_HEADER)) { - traceId = NWilson::TTraceId::FromTraceparentHeader(otelHeader.GetRef()); - } - auto& control = GetTracingControl(requestType); - if (traceId && control.ThrottleExternal()) { - LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::GRPC_SERVER, "Dropping external traceId " << traceId.GetHexTraceId() << " for request type " << requestType); - traceId = {}; - } - if (!traceId && control.SampleThrottle()) { - traceId = NWilson::TTraceId::NewTraceId(control.SampledVerbosity(), 4095); - LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::GRPC_SERVER, "Created new traceId " << traceId.GetHexTraceId() << " for request type " << requestType); + traceId = NWilson::TTraceId::FromTraceparentHeader(otelHeader.GetRef(), TComponentTracingLevels::ProductionVerbose); } + TracingControl->HandleTracing(traceId, ctx.GetRequestDiscriminator()); if (traceId) { NWilson::TSpan grpcRequestProxySpan(TWilsonGrpc::RequestProxy, std::move(traceId), "GrpcRequestProxy"); + if (auto database = ctx.GetDatabaseName()) { + grpcRequestProxySpan.Attribute("database", std::move(*database)); + } + grpcRequestProxySpan.Attribute("request_type", ctx.GetRequestName()); ctx.StartTracing(std::move(grpcRequestProxySpan)); } } @@ -522,6 +504,7 @@ void TGRpcRequestProxyImpl::ForgetDatabase(const TString& database) { while (!queue.empty()) { Counters->IncDatabaseUnavailableCounter(); queue.front().Ctx->ReplyUnauthenticated("Unknown database"); + queue.front().Ctx->FinishSpan(); queue.pop_front(); } DeferredEvents.erase(itDeferredEvents); @@ -617,8 +600,8 @@ void TGRpcRequestProxyImpl::StateFunc(TAutoPtr& ev) { } } -IActor* CreateGRpcRequestProxy(const NKikimrConfig::TAppConfig& appConfig, TIntrusivePtr icb) { - return new TGRpcRequestProxyImpl(appConfig, std::move(icb)); +IActor* CreateGRpcRequestProxy(const NKikimrConfig::TAppConfig& appConfig, TIntrusivePtr tracingControl) { + return new TGRpcRequestProxyImpl(appConfig, std::move(tracingControl)); } } // namespace NGRpcService diff --git a/ydb/core/grpc_services/grpc_request_proxy.h b/ydb/core/grpc_services/grpc_request_proxy.h index b4eedb51c5fc..c665ce1d4ddb 100644 --- a/ydb/core/grpc_services/grpc_request_proxy.h +++ b/ydb/core/grpc_services/grpc_request_proxy.h @@ -6,6 +6,7 @@ #include "grpc_request_proxy_handle_methods.h" #include +#include #include @@ -23,7 +24,7 @@ struct TAppData; namespace NGRpcService { TString DatabaseFromDomain(const TAppData* appdata); -IActor* CreateGRpcRequestProxy(const NKikimrConfig::TAppConfig& appConfig, TIntrusivePtr icb); +IActor* CreateGRpcRequestProxy(const NKikimrConfig::TAppConfig& appConfig, TIntrusivePtr tracingControl); IActor* CreateGRpcRequestProxySimple(const NKikimrConfig::TAppConfig& appConfig); class TGRpcRequestProxy : public TGRpcRequestProxyHandleMethods, public IFacilityProvider { diff --git a/ydb/core/grpc_services/rpc_begin_transaction.cpp b/ydb/core/grpc_services/rpc_begin_transaction.cpp index 374593f22914..2cb6ad321660 100644 --- a/ydb/core/grpc_services/rpc_begin_transaction.cpp +++ b/ydb/core/grpc_services/rpc_begin_transaction.cpp @@ -87,7 +87,7 @@ class TBeginTransactionRPC : public TRpcKqpRequestActorRecord.MutableRequest()->SetAction(NKikimrKqp::QUERY_ACTION_BEGIN_TX); ev->Record.MutableRequest()->MutableTxControl()->mutable_begin_tx()->CopyFrom(req->tx_settings()); - ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release()); + ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release(), 0, 0, Span_.GetTraceId()); } void Handle(NKqp::TEvKqp::TEvQueryResponse::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/grpc_services/rpc_calls.h b/ydb/core/grpc_services/rpc_calls.h index 1f92eb4f6b56..027243062acf 100644 --- a/ydb/core/grpc_services/rpc_calls.h +++ b/ydb/core/grpc_services/rpc_calls.h @@ -50,6 +50,7 @@ inline bool ValidateAndReplyOnError(IRequestProxyCtx* ctx) { const auto issue = MakeIssue(NKikimrIssues::TIssuesIds::YDB_API_VALIDATION_ERROR, validationError); ctx->RaiseIssue(issue); ctx->ReplyWithYdbStatus(Ydb::StatusIds::BAD_REQUEST); + ctx->FinishSpan(); return false; } else { return true; diff --git a/ydb/core/grpc_services/rpc_commit_transaction.cpp b/ydb/core/grpc_services/rpc_commit_transaction.cpp index d06731dd0edb..d555933d5091 100644 --- a/ydb/core/grpc_services/rpc_commit_transaction.cpp +++ b/ydb/core/grpc_services/rpc_commit_transaction.cpp @@ -75,7 +75,7 @@ class TCommitTransactionRPC : public TRpcKqpRequestActorRecord.MutableRequest()->SetStatsMode(GetKqpStatsMode(req->collect_stats())); ev->Record.MutableRequest()->SetCollectStats(req->collect_stats()); - ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release()); + ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release(), 0, 0, Span_.GetTraceId()); } void Handle(NKqp::TEvKqp::TEvQueryResponse::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/grpc_services/rpc_common/rpc_common_kqp_session.cpp b/ydb/core/grpc_services/rpc_common/rpc_common_kqp_session.cpp index 041b1c9e1f9a..c99c5af2a273 100644 --- a/ydb/core/grpc_services/rpc_common/rpc_common_kqp_session.cpp +++ b/ydb/core/grpc_services/rpc_common/rpc_common_kqp_session.cpp @@ -7,7 +7,9 @@ #include "rpc_common.h" #include +#include +#include #include #include #include @@ -33,7 +35,9 @@ using TEvDeleteSessionQueryRequest = TGrpcRequestOperationCall { public: TCreateSessionRPC(IRequestCtx* msg) - : Request(msg) {} + : Request(msg) + , Span(TWilsonGrpc::RequestActor, msg->GetWilsonTraceId(), "CreateSessionRpcActor") + {} void Bootstrap(const TActorContext&) { Become(&TCreateSessionRPC::StateWork); @@ -77,7 +81,7 @@ class TCreateSessionRPC : public TActorBootstrapped { SetDatabase(ev, *Request); - Send(NKqp::MakeKqpProxyID(SelfId().NodeId()), ev.Release()); + Send(NKqp::MakeKqpProxyID(SelfId().NodeId()), ev.Release(), 0, 0, Span.GetTraceId()); } void StateWork(TAutoPtr& ev) { @@ -88,6 +92,7 @@ class TCreateSessionRPC : public TActorBootstrapped { } void Handle(TEvents::TEvWakeup::TPtr&) { + Span.Event("client_lost", {}); ClientLost = true; } @@ -112,7 +117,9 @@ class TCreateSessionRPC : public TActorBootstrapped { void Handle(NKqp::TEvKqp::TEvCreateSessionResponse::TPtr& ev, const TActorContext& ctx) { const auto& record = ev->Get()->Record; if (record.GetResourceExhausted()) { - Request->ReplyWithRpcStatus(grpc::StatusCode::RESOURCE_EXHAUSTED, record.GetError()); + auto responseCode = grpc::StatusCode::RESOURCE_EXHAUSTED; + Request->ReplyWithRpcStatus(responseCode, record.GetError()); + Span.EndError("Resource exhausted"); Die(ctx); return; } @@ -125,6 +132,7 @@ class TCreateSessionRPC : public TActorBootstrapped { Reply(Ydb::StatusIds::INTERNAL_ERROR); } else { SendSessionResult(kqpResponse); + Span.EndOk(); PassAway(); return; } @@ -146,16 +154,19 @@ class TCreateSessionRPC : public TActorBootstrapped { void Reply(Ydb::StatusIds::StatusCode status) { Request->ReplyWithYdbStatus(status); + NWilson::EndSpanWithStatus(Span, status); this->PassAway(); } void Reply(Ydb::StatusIds::StatusCode status, NProtoBuf::Message* resp) { Request->Reply(resp, status); + NWilson::EndSpanWithStatus(Span, status); this->PassAway(); } protected: std::shared_ptr Request; + NWilson::TSpan Span; private: bool ClientLost = false; @@ -202,7 +213,9 @@ class TCreateSessionQueryService : public TCreateSessionRPC { class TDeleteSessionRPC : public TActorBootstrapped { public: TDeleteSessionRPC(IRequestCtx* msg) - : Request(msg) {} + : Request(msg) + , Span(TWilsonGrpc::RequestActor, msg->GetWilsonTraceId(), "DeleteSessionRpcActor") + {} void Bootstrap(const TActorContext&) { DeleteSessionImpl(); @@ -220,12 +233,13 @@ class TDeleteSessionRPC : public TActorBootstrapped { return Reply(Ydb::StatusIds::BAD_REQUEST); } - Send(NKqp::MakeKqpProxyID(SelfId().NodeId()), ev.Release()); //no respose will be sended, so don't wait for anything + Send(NKqp::MakeKqpProxyID(SelfId().NodeId()), ev.Release(), 0, 0, Span.GetTraceId()); //no respose will be sended, so don't wait for anything Reply(Ydb::StatusIds::SUCCESS); } void Reply(Ydb::StatusIds::StatusCode status) { Request->ReplyWithYdbStatus(status); + NWilson::EndSpanWithStatus(Span, status); this->PassAway(); } @@ -233,6 +247,7 @@ class TDeleteSessionRPC : public TActorBootstrapped { protected: std::shared_ptr Request; + NWilson::TSpan Span; }; class TDeleteSessionTableService : public TDeleteSessionRPC { diff --git a/ydb/core/grpc_services/rpc_deferrable.h b/ydb/core/grpc_services/rpc_deferrable.h index 93550e2b11ef..11652e54322c 100644 --- a/ydb/core/grpc_services/rpc_deferrable.h +++ b/ydb/core/grpc_services/rpc_deferrable.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -202,12 +203,14 @@ class TRpcOperationRequestActor : public TRpcRequestWithOperationParamsActor& message, const TActorContext& ctx) { Request_->SendResult(status, message); + NWilson::EndSpanWithStatus(Span_, status); this->Die(ctx); } void Reply(Ydb::StatusIds::StatusCode status, const NYql::TIssues& issues, const TActorContext& ctx) { Request_->RaiseIssues(issues); Request_->ReplyWithYdbStatus(status); + NWilson::EndSpanWithStatus(Span_, status); this->Die(ctx); } @@ -219,6 +222,7 @@ class TRpcOperationRequestActor : public TRpcRequestWithOperationParamsActorReplyWithYdbStatus(status); + NWilson::EndSpanWithStatus(Span_, status); this->Die(ctx); } @@ -226,6 +230,7 @@ class TRpcOperationRequestActor : public TRpcRequestWithOperationParamsActor& message, const TActorContext &ctx) { Request_->SendResult(status, message); + NWilson::EndSpanWithStatus(Span_, status); this->Die(ctx); } @@ -236,6 +241,7 @@ class TRpcOperationRequestActor : public TRpcRequestWithOperationParamsActorSendResult(result, status, message); + NWilson::EndSpanWithStatus(Span_, status); this->Die(ctx); } @@ -244,12 +250,14 @@ class TRpcOperationRequestActor : public TRpcRequestWithOperationParamsActorSendResult(result, status); + NWilson::EndSpanWithStatus(Span_, status); this->Die(ctx); } void ReplyOperation(Ydb::Operations::Operation& operation) { Request_->SendOperation(operation); + NWilson::EndSpanWithStatus(Span_, operation.status()); this->PassAway(); } diff --git a/ydb/core/grpc_services/rpc_discovery.cpp b/ydb/core/grpc_services/rpc_discovery.cpp index 2bc5cc74a46f..84c26fe124fe 100644 --- a/ydb/core/grpc_services/rpc_discovery.cpp +++ b/ydb/core/grpc_services/rpc_discovery.cpp @@ -29,6 +29,8 @@ class TListEndpointsRPC : public TActorBootstrapped { THolder LookupResponse; THolder NameserviceResponse; + NWilson::TSpan Span; + public: static constexpr NKikimrServices::TActivity::EType ActorActivityType() { return NKikimrServices::TActivity::GRPC_REQ; @@ -37,6 +39,7 @@ class TListEndpointsRPC : public TActorBootstrapped { TListEndpointsRPC(TEvListEndpointsRequest::TPtr &msg, TActorId cacheId) : Request(msg->Release().Release()) , CacheId(cacheId) + , Span(TWilsonGrpc::RequestActor, Request->GetWilsonTraceId(), "ListEndpointsRpc") {} void Bootstrap() { @@ -54,6 +57,7 @@ class TListEndpointsRPC : public TActorBootstrapped { if (Discoverer) { Send(Discoverer, new TEvents::TEvPoisonPill()); } + Span.EndOk(); TActorBootstrapped::PassAway(); } diff --git a/ydb/core/grpc_services/rpc_execute_data_query.cpp b/ydb/core/grpc_services/rpc_execute_data_query.cpp index b96c36b1f7d6..7613a26ed444 100644 --- a/ydb/core/grpc_services/rpc_execute_data_query.cpp +++ b/ydb/core/grpc_services/rpc_execute_data_query.cpp @@ -146,7 +146,7 @@ class TExecuteDataQueryRPC : public TRpcKqpRequestActoroperation_params().report_cost_info() == Ydb::FeatureFlag::ENABLED; - ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release()); + ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release(), 0, 0, Span_.GetTraceId()); } static void ConvertReadStats(const NKikimrQueryStats::TReadOpStats& from, Ydb::TableStats::OperationStats* to) { diff --git a/ydb/core/grpc_services/rpc_execute_scheme_query.cpp b/ydb/core/grpc_services/rpc_execute_scheme_query.cpp index 5625e318d1e1..00a12287f49d 100644 --- a/ydb/core/grpc_services/rpc_execute_scheme_query.cpp +++ b/ydb/core/grpc_services/rpc_execute_scheme_query.cpp @@ -73,7 +73,7 @@ class TExecuteSchemeQueryRPC : public TRpcKqpRequestActorRecord.MutableRequest()->SetType(NKikimrKqp::QUERY_TYPE_SQL_DDL); ev->Record.MutableRequest()->SetQuery(req->yql_text()); - ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release()); + ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release(), 0, 0, Span_.GetTraceId()); } void Handle(NKqp::TEvKqp::TEvQueryResponse::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/grpc_services/rpc_execute_yql_script.cpp b/ydb/core/grpc_services/rpc_execute_yql_script.cpp index 49db992aed7a..8f1a23be8a7d 100644 --- a/ydb/core/grpc_services/rpc_execute_yql_script.cpp +++ b/ydb/core/grpc_services/rpc_execute_yql_script.cpp @@ -76,7 +76,7 @@ class TExecuteYqlScriptRPC : public TRpcKqpRequestActorsyntax() ); - ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release()); + ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release(), 0, 0, Span_.GetTraceId()); } void Handle(NKqp::TEvKqp::TEvQueryResponse::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/grpc_services/rpc_explain_data_query.cpp b/ydb/core/grpc_services/rpc_explain_data_query.cpp index 129ac5468bb9..92202713eda6 100644 --- a/ydb/core/grpc_services/rpc_explain_data_query.cpp +++ b/ydb/core/grpc_services/rpc_explain_data_query.cpp @@ -70,7 +70,7 @@ class TExplainDataQueryRPC : public TRpcKqpRequestActorRecord.MutableRequest()->SetQuery(req->yql_text()); ev->Record.MutableRequest()->SetCollectDiagnostics(req->Getcollect_full_diagnostics()); - ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release()); + ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release(), 0, 0, Span_.GetTraceId()); } void Handle(NKqp::TEvKqp::TEvQueryResponse::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/grpc_services/rpc_explain_yql_script.cpp b/ydb/core/grpc_services/rpc_explain_yql_script.cpp index 8494ad758ff8..884ad953fab1 100644 --- a/ydb/core/grpc_services/rpc_explain_yql_script.cpp +++ b/ydb/core/grpc_services/rpc_explain_yql_script.cpp @@ -77,7 +77,7 @@ class TExplainYqlScriptRPC : public TRpcKqpRequestActorRecord.MutableRequest()->SetQuery(script); ev->Record.MutableRequest()->SetKeepSession(false); - ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release()); + ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release(), 0, 0, Span_.GetTraceId()); } void Handle(NKqp::TEvKqp::TEvQueryResponse::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/grpc_services/rpc_load_rows.cpp b/ydb/core/grpc_services/rpc_load_rows.cpp index 64e359be13ef..05cc85065298 100644 --- a/ydb/core/grpc_services/rpc_load_rows.cpp +++ b/ydb/core/grpc_services/rpc_load_rows.cpp @@ -117,8 +117,9 @@ const Ydb::Table::BulkUpsertRequest* GetProtoRequest(IRequestOpCtx* req) { class TUploadRowsRPCPublic : public NTxProxy::TUploadRowsBase { using TBase = NTxProxy::TUploadRowsBase; public: - explicit TUploadRowsRPCPublic(IRequestOpCtx* request, bool diskQuotaExceeded) - : TBase(GetDuration(GetProtoRequest(request)->operation_params().operation_timeout()), diskQuotaExceeded) + explicit TUploadRowsRPCPublic(IRequestOpCtx* request, bool diskQuotaExceeded, const char* name) + : TBase(GetDuration(GetProtoRequest(request)->operation_params().operation_timeout()), diskQuotaExceeded, + NWilson::TSpan(TWilsonKqp::BulkUpsertActor, request->GetWilsonTraceId(), name)) , Request(request) {} @@ -517,7 +518,7 @@ void DoBulkUpsertRequest(std::unique_ptr p, const IFacilityProvid } else if (GetProtoRequest(p.get())->has_csv_settings()) { f.RegisterActor(new TUploadColumnsRPCPublic(p.release(), diskQuotaExceeded)); } else { - f.RegisterActor(new TUploadRowsRPCPublic(p.release(), diskQuotaExceeded)); + f.RegisterActor(new TUploadRowsRPCPublic(p.release(), diskQuotaExceeded, "BulkRowsUpsertActor")); } } @@ -530,7 +531,7 @@ IActor* TEvBulkUpsertRequest::CreateRpcActor(NKikimr::NGRpcService::IRequestOpCt } else if (GetProtoRequest(msg)->has_csv_settings()) { return new TUploadColumnsRPCPublic(msg, diskQuotaExceeded); } else { - return new TUploadRowsRPCPublic(msg, diskQuotaExceeded); + return new TUploadRowsRPCPublic(msg, diskQuotaExceeded, "BulkRowsUpsertActor"); } } diff --git a/ydb/core/grpc_services/rpc_prepare_data_query.cpp b/ydb/core/grpc_services/rpc_prepare_data_query.cpp index a18487cae8b5..673ee5a87f1d 100644 --- a/ydb/core/grpc_services/rpc_prepare_data_query.cpp +++ b/ydb/core/grpc_services/rpc_prepare_data_query.cpp @@ -79,7 +79,7 @@ class TPrepareDataQueryRPC : public TRpcKqpRequestActorRecord.MutableRequest()->SetType(NKikimrKqp::QUERY_TYPE_SQL_DML); ev->Record.MutableRequest()->SetQuery(req->yql_text()); - ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release()); + ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release(), 0, 0, Span_.GetTraceId()); } void Handle(NKqp::TEvKqp::TEvQueryResponse::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/grpc_services/rpc_read_rows.cpp b/ydb/core/grpc_services/rpc_read_rows.cpp index 32465ee207a2..2769ff5a641f 100644 --- a/ydb/core/grpc_services/rpc_read_rows.cpp +++ b/ydb/core/grpc_services/rpc_read_rows.cpp @@ -85,6 +85,7 @@ class TReadRowsRPC : public TActorBootstrapped { explicit TReadRowsRPC(std::unique_ptr request) : Request(std::move(request)) , PipeCache(MakePipePeNodeCacheID(true)) + , Span(TWilsonGrpc::RequestActor, Request->GetWilsonTraceId(), "ReadRowsRpc") {} bool BuildSchema(NSchemeCache::TSchemeCacheNavigate* resolveNamesResult, TString& errorMessage) { @@ -355,7 +356,7 @@ class TReadRowsRPC : public TActorBootstrapped { entry.ShowPrivatePath = false; auto request = std::make_unique(); request->ResultSet.emplace_back(entry); - Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvNavigateKeySet(request.release())); + Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvNavigateKeySet(request.release()), 0, 0, Span.GetTraceId()); return true; } @@ -439,7 +440,7 @@ class TReadRowsRPC : public TActorBootstrapped { auto request = std::make_unique(); request->ResultSet.emplace_back(std::move(keyRange)); - Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvResolveKeySet(request.release())); + Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvResolveKeySet(request.release()), 0, 0, Span.GetTraceId()); } void CreateShardToKeysMapping(TKeyDesc* keyRange) { @@ -495,7 +496,7 @@ class TReadRowsRPC : public TActorBootstrapped { } LOG_DEBUG_S(TlsActivationContext->AsActorContext(), NKikimrServices::RPC_REQUEST, "TReadRowsRPC send TEvRead shardId : " << shardId << " keys.size(): " << keys.size()); - Send(PipeCache, new TEvPipeCache::TEvForward(request.release(), shardId, true), IEventHandle::FlagTrackDelivery); + Send(PipeCache, new TEvPipeCache::TEvForward(request.release(), shardId, true), IEventHandle::FlagTrackDelivery, 0, Span.GetTraceId()); ++ReadsInFlight; } @@ -672,6 +673,7 @@ class TReadRowsRPC : public TActorBootstrapped { if (TimeoutTimerActorId) { Send(TimeoutTimerActorId, new TEvents::TEvPoisonPill()); } + Span.EndOk(); TBase::PassAway(); } @@ -724,6 +726,8 @@ class TReadRowsRPC : public TActorBootstrapped { ui64 Retries = 0; const ui64 MaxTotalRetries = 5; + + NWilson::TSpan Span; }; void DoReadRowsRequest(std::unique_ptr p, const IFacilityProvider& f) { diff --git a/ydb/core/grpc_services/rpc_rollback_transaction.cpp b/ydb/core/grpc_services/rpc_rollback_transaction.cpp index b1d4aaaddec9..9d17fa77dd0e 100644 --- a/ydb/core/grpc_services/rpc_rollback_transaction.cpp +++ b/ydb/core/grpc_services/rpc_rollback_transaction.cpp @@ -70,7 +70,7 @@ class TRollbackTransactionRPC : public TRpcKqpRequestActorRecord.MutableRequest()->SetAction(NKikimrKqp::QUERY_ACTION_ROLLBACK_TX); ev->Record.MutableRequest()->MutableTxControl()->set_tx_id(req->tx_id()); - ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release()); + ctx.Send(NKqp::MakeKqpProxyID(ctx.SelfID.NodeId()), ev.Release(), 0, 0, Span_.GetTraceId()); } void Handle(NKqp::TEvKqp::TEvQueryResponse::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/grpc_services/ya.make b/ydb/core/grpc_services/ya.make index c3fd88321a38..4e6273649960 100644 --- a/ydb/core/grpc_services/ya.make +++ b/ydb/core/grpc_services/ya.make @@ -115,6 +115,7 @@ PEERDIR( ydb/core/tx/sharding ydb/core/tx/long_tx_service/public ydb/core/tx/data_events + ydb/core/util ydb/core/ydb_convert ydb/core/security ydb/library/aclib diff --git a/ydb/core/jaeger_tracing/request_discriminator.cpp b/ydb/core/jaeger_tracing/request_discriminator.cpp new file mode 100644 index 000000000000..79a647f89cd5 --- /dev/null +++ b/ydb/core/jaeger_tracing/request_discriminator.cpp @@ -0,0 +1,10 @@ +#include "request_discriminator.h" + +namespace NKikimr::NJaegerTracing { + +const TRequestDiscriminator TRequestDiscriminator::EMPTY { + .RequestType = ERequestType::UNSPECIFIED, + .Database = NothingObject, +}; + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/request_discriminator.h b/ydb/core/jaeger_tracing/request_discriminator.h new file mode 100644 index 000000000000..48e1284bef91 --- /dev/null +++ b/ydb/core/jaeger_tracing/request_discriminator.h @@ -0,0 +1,127 @@ +#pragma once + +#include + +#include +#include +#include +#include + +namespace NKikimr::NJaegerTracing { + +enum class ERequestType: size_t { + UNSPECIFIED, + + KEYVALUE_CREATEVOLUME, + KEYVALUE_DROPVOLUME, + KEYVALUE_ALTERVOLUME, + KEYVALUE_DESCRIBEVOLUME, + KEYVALUE_LISTLOCALPARTITIONS, + KEYVALUE_ACQUIRELOCK, + KEYVALUE_EXECUTETRANSACTION, + KEYVALUE_READ, + KEYVALUE_READRANGE, + KEYVALUE_LISTRANGE, + KEYVALUE_GETSTORAGECHANNELSTATUS, + + TABLE_CREATESESSION, + TABLE_KEEPALIVE, + TABLE_ALTERTABLE, + TABLE_CREATETABLE, + TABLE_DROPTABLE, + TABLE_DESCRIBETABLE, + TABLE_COPYTABLE, + TABLE_COPYTABLES, + TABLE_RENAMETABLES, + TABLE_EXPLAINDATAQUERY, + TABLE_EXECUTESCHEMEQUERY, + TABLE_BEGINTRANSACTION, + TABLE_DESCRIBETABLEOPTIONS, + TABLE_DELETESESSION, + TABLE_COMMITTRANSACTION, + TABLE_ROLLBACKTRANSACTION, + TABLE_PREPAREDATAQUERY, + TABLE_EXECUTEDATAQUERY, + TABLE_BULKUPSERT, + TABLE_STREAMEXECUTESCANQUERY, + TABLE_STREAMREADTABLE, + TABLE_READROWS, + + QUERY_EXECUTEQUERY, + QUERY_EXECUTESCRIPT, + QUERY_FETCHSCRIPTRESULTS, + QUERY_CREATESESSION, + QUERY_DELETESESSION, + QUERY_ATTACHSESSION, + QUERY_BEGINTRANSACTION, + QUERY_COMMITTRANSACTION, + QUERY_ROLLBACKTRANSACTION, + + DISCOVERY_WHOAMI, + DISCOVERY_NODEREGISTRATION, + DISCOVERY_LISTENDPOINTS, + + REQUEST_TYPES_CNT, // Add new types above this line +}; + +static constexpr size_t kRequestTypesCnt = static_cast(ERequestType::REQUEST_TYPES_CNT); + +static const THashMap NameToRequestType = { + {"KeyValue.CreateVolume", ERequestType::KEYVALUE_CREATEVOLUME}, + {"KeyValue.DropVolume", ERequestType::KEYVALUE_DROPVOLUME}, + {"KeyValue.AlterVolume", ERequestType::KEYVALUE_ALTERVOLUME}, + {"KeyValue.DescribeVolume", ERequestType::KEYVALUE_DESCRIBEVOLUME}, + {"KeyValue.ListLocalPartitions", ERequestType::KEYVALUE_LISTLOCALPARTITIONS}, + {"KeyValue.AcquireLock", ERequestType::KEYVALUE_ACQUIRELOCK}, + {"KeyValue.ExecuteTransaction", ERequestType::KEYVALUE_EXECUTETRANSACTION}, + {"KeyValue.Read", ERequestType::KEYVALUE_READ}, + {"KeyValue.ReadRange", ERequestType::KEYVALUE_READRANGE}, + {"KeyValue.ListRange", ERequestType::KEYVALUE_LISTRANGE}, + {"KeyValue.GetStorageChannelStatus", ERequestType::KEYVALUE_GETSTORAGECHANNELSTATUS}, + + {"Table.CreateSession", ERequestType::TABLE_CREATESESSION}, + {"Table.KeepAlive", ERequestType::TABLE_KEEPALIVE}, + {"Table.AlterTable", ERequestType::TABLE_ALTERTABLE}, + {"Table.CreateTable", ERequestType::TABLE_CREATETABLE}, + {"Table.DropTable", ERequestType::TABLE_DROPTABLE}, + {"Table.DescribeTable", ERequestType::TABLE_DESCRIBETABLE}, + {"Table.CopyTable", ERequestType::TABLE_COPYTABLE}, + {"Table.CopyTables", ERequestType::TABLE_COPYTABLES}, + {"Table.RenameTables", ERequestType::TABLE_RENAMETABLES}, + {"Table.ExplainDataQuery", ERequestType::TABLE_EXPLAINDATAQUERY}, + {"Table.ExecuteSchemeQuery", ERequestType::TABLE_EXECUTESCHEMEQUERY}, + {"Table.BeginTransaction", ERequestType::TABLE_BEGINTRANSACTION}, + {"Table.DescribeTableOptions", ERequestType::TABLE_DESCRIBETABLEOPTIONS}, + {"Table.DeleteSession", ERequestType::TABLE_DELETESESSION}, + {"Table.CommitTransaction", ERequestType::TABLE_COMMITTRANSACTION}, + {"Table.RollbackTransaction", ERequestType::TABLE_ROLLBACKTRANSACTION}, + {"Table.PrepareDataQuery", ERequestType::TABLE_PREPAREDATAQUERY}, + {"Table.ExecuteDataQuery", ERequestType::TABLE_EXECUTEDATAQUERY}, + {"Table.BulkUpsert", ERequestType::TABLE_BULKUPSERT}, + {"Table.StreamExecuteScanQuery", ERequestType::TABLE_STREAMEXECUTESCANQUERY}, + {"Table.StreamReadTable", ERequestType::TABLE_STREAMREADTABLE}, + {"Table.ReadRows", ERequestType::TABLE_READROWS}, + + {"Query.ExecuteQuery", ERequestType::QUERY_EXECUTEQUERY}, + {"Query.ExecuteScript", ERequestType::QUERY_EXECUTESCRIPT}, + {"Query.FetchScriptResults", ERequestType::QUERY_FETCHSCRIPTRESULTS}, + {"Query.CreateSession", ERequestType::QUERY_CREATESESSION}, + {"Query.DeleteSession", ERequestType::QUERY_DELETESESSION}, + {"Query.AttachSession", ERequestType::QUERY_ATTACHSESSION}, + {"Query.BeginTransaction", ERequestType::QUERY_BEGINTRANSACTION}, + {"Query.CommitTransaction", ERequestType::QUERY_COMMITTRANSACTION}, + {"Query.RollbackTransaction", ERequestType::QUERY_ROLLBACKTRANSACTION}, + + {"Discovery.WhoAmI", ERequestType::DISCOVERY_WHOAMI}, + {"Discovery.NodeRegistration", ERequestType::DISCOVERY_NODEREGISTRATION}, + {"Discovery.ListEndpoints", ERequestType::DISCOVERY_LISTENDPOINTS}, +}; + +struct TRequestDiscriminator { + ERequestType RequestType = ERequestType::UNSPECIFIED; + TMaybe Database = NothingObject; + + static const TRequestDiscriminator EMPTY; +}; + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/sampler.h b/ydb/core/jaeger_tracing/sampler.h new file mode 100644 index 000000000000..1adaed88f2b5 --- /dev/null +++ b/ydb/core/jaeger_tracing/sampler.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace NKikimr::NJaegerTracing { + +class TSampler { +public: + TSampler(double fraction, ui64 seed) + : SamplingFraction(fraction) + , Rng(seed) + {} + + bool Sample() { + return Rng.GenRandReal1() < SamplingFraction; + } + +private: + const double SamplingFraction; + TFastRng64 Rng; +}; + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/sampler_ut.cpp b/ydb/core/jaeger_tracing/sampler_ut.cpp new file mode 100644 index 000000000000..2d1cd587b462 --- /dev/null +++ b/ydb/core/jaeger_tracing/sampler_ut.cpp @@ -0,0 +1,41 @@ +#include "sampler.h" + +#include + +namespace NKikimr::NJaegerTracing { + +Y_UNIT_TEST_SUITE(SamplingControlTests) { + ui32 RunTrials(TSampler& sampler, ui32 trials) { + ui32 cnt = 0; + for (ui32 i = 0; i < trials; ++i) { + if (sampler.Sample()) { + ++cnt; + } + } + return cnt; + } + + Y_UNIT_TEST(Simple) { + TSampler sampler(0.5, 42); + + auto samples = RunTrials(sampler, 100'000); + UNIT_ASSERT_GE(samples, 48'000); + UNIT_ASSERT_LE(samples, 52'000); + } + + Y_UNIT_TEST(EdgeCaseLower) { + TSampler sampler(0, 42); + + auto samples = RunTrials(sampler, 100'000); + UNIT_ASSERT_EQUAL(samples, 0); + } + + Y_UNIT_TEST(EdgeCaseUpper) { + TSampler sampler(1, 42); + + auto samples = RunTrials(sampler, 100'000); + UNIT_ASSERT_EQUAL(samples, 100'000); + } +} + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/sampling_throttling_configurator.cpp b/ydb/core/jaeger_tracing/sampling_throttling_configurator.cpp new file mode 100644 index 000000000000..4a22d79bda29 --- /dev/null +++ b/ydb/core/jaeger_tracing/sampling_throttling_configurator.cpp @@ -0,0 +1,84 @@ +#include "sampling_throttling_configurator.h" + +#include "sampling_throttling_control.h" +#include "sampling_throttling_control_internals.h" + +#include +#include +#include + +namespace NKikimr::NJaegerTracing { + +namespace { + +template +void PropagateUnspecifiedRequest(TRulesContainer& rules) { + constexpr auto unspecifiedRequestType = static_cast(ERequestType::UNSPECIFIED); + const auto& unspecifiedRequestTypeRules = rules[unspecifiedRequestType]; + + for (size_t requestType = 0; requestType < kRequestTypesCnt; ++requestType) { + if (requestType == unspecifiedRequestType) { + continue; + } + + auto& requestTypeDatabaseRules = rules[requestType].DatabaseRules; + auto& requestTypeGlobalRules = rules[requestType].Global; + for (const auto& [database, unspecifiedDatabaseRules] : unspecifiedRequestTypeRules.DatabaseRules) { + auto& databaseRules = requestTypeDatabaseRules[database]; + databaseRules.insert(databaseRules.end(), unspecifiedDatabaseRules.begin(), + unspecifiedDatabaseRules.end()); + } + requestTypeGlobalRules.insert(requestTypeGlobalRules.end(), + unspecifiedRequestTypeRules.Global.begin(), + unspecifiedRequestTypeRules.Global.end()); + } +} + +} // namespace anonymous + +TSamplingThrottlingConfigurator::TSamplingThrottlingConfigurator(TIntrusivePtr timeProvider, + TIntrusivePtr& randomProvider) + : TimeProvider(std::move(timeProvider)) + , Rng(randomProvider->GenRand64()) + , CurrentSettings(GenerateThrottlers({})) +{} + +TIntrusivePtr TSamplingThrottlingConfigurator::GetControl() { + auto control = TIntrusivePtr(new TSamplingThrottlingControl(GenerateSetup())); + IssuedControls.push_back(control); + return control; +} + +void TSamplingThrottlingConfigurator::UpdateSettings(TSettings> settings) { + auto enrichedSettings = GenerateThrottlers(std::move(settings)); + PropagateUnspecifiedRequest(enrichedSettings.SamplingRules); + PropagateUnspecifiedRequest(enrichedSettings.ExternalThrottlingRules); + CurrentSettings = std::move(enrichedSettings); + + for (auto& control : IssuedControls) { + control->UpdateImpl(GenerateSetup()); + } +} + +TSettings> TSamplingThrottlingConfigurator::GenerateThrottlers( + TSettings> settings) { + THashMap> throttlers; + return settings.MapThrottler([this, &throttlers](const TWithTag& settings) { + if (auto it = throttlers.FindPtr(settings.Tag)) { + return *it; + } + auto throttler = MakeIntrusive(settings.Value.MaxTracesPerMinute, settings.Value.MaxTracesBurst, TimeProvider); + throttlers[settings.Tag] = throttler; + return throttler; + }); +} + +std::unique_ptr TSamplingThrottlingConfigurator::GenerateSetup() { + auto setup = CurrentSettings.MapSampler([this](double fraction) { + return TSampler(fraction, Rng()); + }); + + return std::make_unique(std::move(setup)); +} + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/sampling_throttling_configurator.h b/ydb/core/jaeger_tracing/sampling_throttling_configurator.h new file mode 100644 index 000000000000..642c37e0d16f --- /dev/null +++ b/ydb/core/jaeger_tracing/sampling_throttling_configurator.h @@ -0,0 +1,46 @@ +#pragma once + +#include "sampling_throttling_control.h" + +#include "throttler.h" +#include "settings.h" + +#include + +#include +#include + +#include +#include + +namespace NKikimr::NJaegerTracing { + +// Used to represent shared limits in throttlers and samplers +template +struct TWithTag { + T Value; + size_t Tag; +}; + +class TSamplingThrottlingConfigurator: private TMoveOnly { +public: + TSamplingThrottlingConfigurator(TIntrusivePtr timeProvider, + TIntrusivePtr& randomProvider); + + TIntrusivePtr GetControl(); + + void UpdateSettings(TSettings> settings); + +private: + TSettings> GenerateThrottlers( + TSettings> settings); + + std::unique_ptr GenerateSetup(); + + TVector> IssuedControls; + TIntrusivePtr TimeProvider; + TFastRng64 Rng; + TSettings> CurrentSettings; +}; + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/sampling_throttling_control.cpp b/ydb/core/jaeger_tracing/sampling_throttling_control.cpp new file mode 100644 index 000000000000..584eeb54d4aa --- /dev/null +++ b/ydb/core/jaeger_tracing/sampling_throttling_control.cpp @@ -0,0 +1,28 @@ +#include "sampling_throttling_control.h" + +#include "sampling_throttling_control_internals.h" + +namespace NKikimr::NJaegerTracing { + +TSamplingThrottlingControl::TSamplingThrottlingControl(std::unique_ptr initialImpl) + : Impl(std::move(initialImpl)) +{} + +TSamplingThrottlingControl::~TSamplingThrottlingControl() { + UpdateImpl(nullptr); +} + +void TSamplingThrottlingControl::HandleTracing(NWilson::TTraceId& traceId, const TRequestDiscriminator& discriminator) { + if (ImplUpdate.load(std::memory_order_relaxed)) { + auto newImpl = std::unique_ptr(ImplUpdate.exchange(nullptr, std::memory_order_relaxed)); + Y_ABORT_UNLESS(newImpl); + Impl = std::move(newImpl); + } + Impl->HandleTracing(traceId, discriminator); +} + +void TSamplingThrottlingControl::UpdateImpl(std::unique_ptr newImpl) { + std::unique_ptr guard(ImplUpdate.exchange(newImpl.release(), std::memory_order_relaxed)); +} + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/sampling_throttling_control.h b/ydb/core/jaeger_tracing/sampling_throttling_control.h new file mode 100644 index 000000000000..fce17a0730dd --- /dev/null +++ b/ydb/core/jaeger_tracing/sampling_throttling_control.h @@ -0,0 +1,32 @@ +#pragma once + +#include "request_discriminator.h" + +#include + +namespace NKikimr::NJaegerTracing { + +class TSamplingThrottlingControl: public TThrRefBase { + friend class TSamplingThrottlingConfigurator; + +public: + void HandleTracing(NWilson::TTraceId& traceId, const TRequestDiscriminator& discriminator); + + ~TSamplingThrottlingControl(); + +private: + struct TSamplingThrottlingImpl; + + // Should only be obtained from TSamplingThrottlingConfigurator + TSamplingThrottlingControl(std::unique_ptr initialImpl); + + void UpdateImpl(std::unique_ptr newParams); + + // Exclusively owned by the only thread, that may call HandleTracing + std::unique_ptr Impl; + + // Shared between the thread calling HandleTracing and the thread calling UpdateParams + std::atomic ImplUpdate{nullptr}; +}; + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/sampling_throttling_control_internals.cpp b/ydb/core/jaeger_tracing/sampling_throttling_control_internals.cpp new file mode 100644 index 000000000000..d7de858ee588 --- /dev/null +++ b/ydb/core/jaeger_tracing/sampling_throttling_control_internals.cpp @@ -0,0 +1,62 @@ +#include "sampling_throttling_control_internals.h" + + +namespace NKikimr::NJaegerTracing { + +namespace { + +template +void ForEachMatchingRule(TRequestTypeRules& rules, const TMaybe& database, TAction&& action) { + for (auto& rule : rules.Global) { + action(rule); + } + if (database) { + if (auto databaseRules = rules.DatabaseRules.FindPtr(*database)) { + for (auto& rule : *databaseRules) { + action(rule); + } + } + } +} + +} // namespace anonymous + +void TSamplingThrottlingControl::TSamplingThrottlingImpl::HandleTracing( + NWilson::TTraceId& traceId, TRequestDiscriminator discriminator) { + auto requestType = static_cast(discriminator.RequestType); + auto database = std::move(discriminator.Database); + + if (traceId) { + bool throttle = true; + + ForEachMatchingRule( + Setup.ExternalThrottlingRules[requestType], database, + [&throttle](auto& throttlingRule) { + throttle = throttlingRule.Throttler->Throttle() && throttle; + }); + + if (throttle) { + traceId = {}; + } + } + + if (!traceId) { + TMaybe level; + ForEachMatchingRule( + Setup.SamplingRules[requestType], database, + [&level](auto& samplingRule) { + if (!samplingRule.Sampler.Sample() || samplingRule.Throttler->Throttle()) { + return; + } + if (!level || samplingRule.Level > *level) { + level = samplingRule.Level; + } + }); + + if (level) { + traceId = NWilson::TTraceId::NewTraceId(*level, Max()); + } + } +} + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/sampling_throttling_control_internals.h b/ydb/core/jaeger_tracing/sampling_throttling_control_internals.h new file mode 100644 index 000000000000..34a9ba9ffe11 --- /dev/null +++ b/ydb/core/jaeger_tracing/sampling_throttling_control_internals.h @@ -0,0 +1,25 @@ +#pragma once + +#include "sampler.h" +#include "throttler.h" +#include "sampling_throttling_control.h" +#include "settings.h" + +#include +#include + +#include + +namespace NKikimr::NJaegerTracing { + +struct TSamplingThrottlingControl::TSamplingThrottlingImpl { + TSamplingThrottlingImpl(TSettings>&& settings) + : Setup(std::move(settings)) + {} + + TSettings> Setup; + + void HandleTracing(NWilson::TTraceId& traceId, TRequestDiscriminator discriminator); +}; + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/settings.h b/ydb/core/jaeger_tracing/settings.h new file mode 100644 index 000000000000..22b68df3bca3 --- /dev/null +++ b/ydb/core/jaeger_tracing/settings.h @@ -0,0 +1,165 @@ +#pragma once + +#include "request_discriminator.h" + +#include + +#include +#include + +namespace NKikimr::NJaegerTracing { + +struct TThrottlingSettings { + ui64 MaxTracesPerMinute; + ui64 MaxTracesBurst; +}; + +template +struct TSamplingRule { + ui8 Level; + TSampling Sampler; + TThrottling Throttler; + + template + auto MapSampler(TFunc&& f) const { + using TNewSamplingType = std::invoke_result_t; + + return TSamplingRule { + .Level = Level, + .Sampler = std::forward(f)(Sampler), + .Throttler = Throttler, + }; + } + + template + auto MapThrottler(TFunc&& f) const { + using TNewThrottlingType = std::invoke_result_t; + + return TSamplingRule { + .Level = Level, + .Sampler = Sampler, + .Throttler = std::forward(f)(Throttler), + }; + } +}; + +template +struct TExternalThrottlingRule { + TThrottling Throttler; + + template + auto MapThrottler(TFunc&& f) const { + using TNewThrottlingType = std::invoke_result_t; + + return TExternalThrottlingRule { + .Throttler = std::forward(f)(Throttler), + }; + } +}; + +template +struct TRequestTypeRules { + TStackVec Global; + THashMap> DatabaseRules; +}; + +template +using TRulesContainer = std::array, kRequestTypesCnt>; + +template +struct TSettings { +public: + TRulesContainer> SamplingRules; + TRulesContainer> ExternalThrottlingRules; + + template + auto MapSampler(TFunc&& f) const { + using TNewSamplingType = std::invoke_result_t; + + return TSettings { + .SamplingRules = MapContainerValues( + SamplingRules, + [&f](const auto& v) { + return v.MapSampler(f); + } + ), + .ExternalThrottlingRules = ExternalThrottlingRules, + }; + } + + template + auto MapThrottler(TFunc&& f) const { + using TNewThrottlingType = std::invoke_result_t; + + return TSettings { + .SamplingRules = MapContainerValues( + SamplingRules, + [&f](const auto& v) { + return v.MapThrottler(f); + } + ), + .ExternalThrottlingRules = MapContainerValues( + ExternalThrottlingRules, + [&f](const auto& v) { + return v.MapThrottler(f); + } + ), + }; + } + +private: + template + static auto MapValues(const TStackVec& v, TFunc&& f) { + using TResultValue = std::invoke_result_t; + + TStackVec result; + result.reserve(v.size()); + for (const auto& item : v) { + result.push_back(f(item)); + } + return result; + } + + template + static auto MapValues(const THashMap& m, TFunc&& f) { + using TResultValue = std::invoke_result_t; + + THashMap result; + result.reserve(m.size()); + for (const auto& [key, value] : m) { + result.emplace(key, f(value)); + } + return result; + } + + template + static auto MapValues(const std::array& v, TFunc&& f) { + using TResultValue = std::invoke_result_t; + + return [&v, &f](std::index_sequence) -> std::array { + return { f(v[I])...}; + }(std::make_index_sequence()); + } + + template + static TRulesContainer> MapContainerValues(const TRulesContainer& v, TFunc&& f) { + using TResultValue = std::invoke_result_t; + + return MapValues( + v, + [&f](const TRequestTypeRules& reqTypeRules) { + return TRequestTypeRules { + .Global = MapValues(reqTypeRules.Global, f), + .DatabaseRules = MapValues( + reqTypeRules.DatabaseRules, + [&f](const auto& dbSamplingRules) { + return MapValues(dbSamplingRules, f); + } + ), + }; + } + ); + } +}; + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/throttler.cpp b/ydb/core/jaeger_tracing/throttler.cpp new file mode 100644 index 000000000000..d361e9b053e6 --- /dev/null +++ b/ydb/core/jaeger_tracing/throttler.cpp @@ -0,0 +1,78 @@ +#include "throttler.h" + +#include + +namespace NKikimr::NJaegerTracing { + +TThrottler::TThrottler(ui64 maxRatePerMinute, ui64 maxBurst, TIntrusivePtr timeProvider) + : MaxTracesPerMinute(maxRatePerMinute) + , MaxTracesBurst(maxBurst + 1) + , BetweenSends(TDuration::Minutes(1).MicroSeconds() / MaxTracesPerMinute) + , TimeProvider(std::move(timeProvider)) + , EffectiveTs(TimeProvider->Now().MicroSeconds()) +{} + +bool TThrottler::Throttle() { + auto now = TimeProvider->Now().MicroSeconds(); + auto ts = EffectiveTs.load(std::memory_order_relaxed); + auto maxFinalTs = ClampAdd(now, ClampMultiply(BetweenSends, MaxTracesBurst)); + while (true) { + if (ts < now) { + if (EffectiveTs.compare_exchange_weak(ts, now + BetweenSends, std::memory_order_relaxed)) { + return false; + } + SpinLockPause(); + } else if (ts + BetweenSends > maxFinalTs) { + return true; + } else if (EffectiveTs.fetch_add(BetweenSends, std::memory_order_relaxed) + BetweenSends > maxFinalTs) { + EffectiveTs.fetch_sub(BetweenSends, std::memory_order_relaxed); + return true; + } else { + return false; + } + } +} + +ui64 TThrottler::ClampAdd(ui64 a, ui64 b) { +#if defined(__has_builtin) && __has_builtin(__builtin_add_overflow) + + ui64 res; + if (__builtin_add_overflow(a, b, &res)) { + return Max(); + } else { + return res; + } + +#else + + if (a > Max() - b) { + return Max(); + } + return a + b; + +#endif +} + +ui64 TThrottler::ClampMultiply(ui64 a, ui64 b) { +#if defined(__has_builtin) && __has_builtin(__builtin_mul_overflow) + + ui64 res; + if (__builtin_mul_overflow(a, b, &res)) { + return Max(); + } else { + return res; + } + +#else + + ui128 prod = a; + prod *= b; + if (prod > Max()) { + return Max(); + } + return static_cast(prod); + +#endif +} + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/throttler.h b/ydb/core/jaeger_tracing/throttler.h new file mode 100644 index 000000000000..c0b39913ccab --- /dev/null +++ b/ydb/core/jaeger_tracing/throttler.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +namespace NKikimr::NJaegerTracing { + +class TThrottler: public TThrRefBase { +public: + TThrottler(ui64 maxRatePerMinute, ui64 maxBurst, TIntrusivePtr timeProvider); + + bool Throttle(); + +private: + static ui64 ClampAdd(ui64 a, ui64 b); + static ui64 ClampMultiply(ui64 a, ui64 b); + + const ui64 MaxTracesPerMinute; + const ui64 MaxTracesBurst; + const ui64 BetweenSends; + TIntrusivePtr TimeProvider; + std::atomic EffectiveTs; + + static_assert(decltype(EffectiveTs)::is_always_lock_free); +}; + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/throttler_ut.cpp b/ydb/core/jaeger_tracing/throttler_ut.cpp new file mode 100644 index 000000000000..aed397f09411 --- /dev/null +++ b/ydb/core/jaeger_tracing/throttler_ut.cpp @@ -0,0 +1,175 @@ +#include "throttler.h" + +#include +#include + +#include + +#include + +namespace NKikimr::NJaegerTracing { + +class TTimeProviderMock : public ITimeProvider { +public: + TTimeProviderMock(TInstant now) : CurrentTimeUS(now.GetValue()) {} + + void Advance(TDuration delta) { + CurrentTimeUS.fetch_add(delta.GetValue()); + } + + TInstant Now() final { + return TInstant::FromValue(CurrentTimeUS.load()); + } + +private: + std::atomic CurrentTimeUS; +}; + +Y_UNIT_TEST_SUITE(ThrottlerControlTests) { + void CheckAtLeast(TThrottler& throttler, ui32 n) { + for (ui32 i = 0; i < n; ++i) { + UNIT_ASSERT(!throttler.Throttle()); + } + } + + void CheckExact(TThrottler& throttler, ui32 n) { + CheckAtLeast(throttler, n); + UNIT_ASSERT(throttler.Throttle()); + } + + Y_UNIT_TEST(Simple) { + auto timeProvider = MakeIntrusive(TInstant::Now()); + + TThrottler throttler(6, 2, timeProvider); + CheckExact(throttler, 3); + CheckExact(throttler, 0); + + timeProvider->Advance(TDuration::Seconds(9)); + CheckExact(throttler, 0); + timeProvider->Advance(TDuration::Seconds(1)); + CheckExact(throttler, 1); + + timeProvider->Advance(TDuration::Seconds(15)); + CheckExact(throttler, 1); + + timeProvider->Advance(TDuration::Seconds(15)); + CheckExact(throttler, 2); + } + + Y_UNIT_TEST(LongIdle) { + auto timeProvider = MakeIntrusive(TInstant::Now()); + + TThrottler throttler(10, 2, timeProvider); + CheckAtLeast(throttler, 3); + + timeProvider->Advance(TDuration::Hours(1)); + CheckExact(throttler, 3); + } + + Y_UNIT_TEST(Overflow_1) { + auto timeProvider = MakeIntrusive(TInstant::Now()); + + TThrottler throttler(1'000'000'000'000'000'000, 20'000, timeProvider); + + // TODO(pumpurum): switch back to CheckExact when we figure out how to limit properly + CheckAtLeast(throttler, 20'001); + + timeProvider->Advance(TDuration::Days(365 * 10)); + + CheckAtLeast(throttler, 20'001); + } + + Y_UNIT_TEST(Overflow_2) { + auto timeProvider = MakeIntrusive(TInstant::Now()); + + TThrottler throttler(1'000'000'000'000'000'000, 1, timeProvider); + CheckAtLeast(throttler, 1); + + timeProvider->Advance(TDuration::Days(365 * 10)); + + CheckAtLeast(throttler, 1); + } + + void TestMultiThreaded(ui32 threads, ui64 ticks, ui64 init, ui64 step) { + constexpr std::array delays = { + TDuration::MilliSeconds(1), + TDuration::MilliSeconds(10), + TDuration::MilliSeconds(100), + TDuration::Seconds(1) + }; + + auto timeProvider = MakeIntrusive(TInstant::Now()); + + TThrottler throttler(60, init - 1, timeProvider); + + auto shouldStop = std::make_shared>(false); + TVector> workers; + Y_SCOPE_EXIT(shouldStop, &workers) { + shouldStop->store(true); + + try { + for (auto& worker : workers) { + worker->Join(); + } + } catch (yexception& e) { + Cerr << "Failed to join worker:" << Endl; + Cerr << e.what() << Endl; + } + }; + + std::atomic totalConsumed{0}; + workers.reserve(threads); + for (size_t i = 0; i < threads; ++i) { + workers.push_back(MakeHolder([&]() { + while (!shouldStop->load(std::memory_order_relaxed)) { + if (!throttler.Throttle()) { + totalConsumed.fetch_add(1); + } + } + })); + } + for (auto& worker : workers) { + worker->Start(); + } + + auto waitForIncrease = [&](ui64 expected) -> bool { + for (const TDuration& delay : delays) { + Sleep(delay); + if (totalConsumed.load() == expected) { + return true; + } + } + return false; + }; + + ui64 expected = init; + UNIT_ASSERT(waitForIncrease(expected)); + + auto advance = [&](ui64 seconds, ui64 expectedIncrease) { + timeProvider->Advance(TDuration::Seconds(seconds)); + expected += expectedIncrease; + UNIT_ASSERT(waitForIncrease(expected)); + }; + + advance(1, 1); + + for (size_t i = 0; i < ticks; ++i) { + advance(step, step); + } + + advance(init + 1000, init); + } + + #define TEST_MULTI_THREADED(threads, ticks, init, step) \ + Y_UNIT_TEST(MultiThreaded##threads##Threads##ticks##Ticks##init##Init##step##Step) { \ + TestMultiThreaded(threads, ticks, init, step); \ + } + + TEST_MULTI_THREADED(2, 200, 30, 7); + TEST_MULTI_THREADED(5, 150, 500, 15); + TEST_MULTI_THREADED(10, 100, 1000, 22); + + #undef TEST_MULTI_THREADED +} + +} // namespace NKikimr::NJaegerTracing diff --git a/ydb/core/jaeger_tracing/ut/ya.make b/ydb/core/jaeger_tracing/ut/ya.make new file mode 100644 index 000000000000..a3b67286e633 --- /dev/null +++ b/ydb/core/jaeger_tracing/ut/ya.make @@ -0,0 +1,14 @@ +UNITTEST_FOR(ydb/core/jaeger_tracing) + +FORK_SUBTESTS() + +TIMEOUT(600) + +SIZE(MEDIUM) + +SRCS( + sampler_ut.cpp + throttler_ut.cpp +) + +END() diff --git a/ydb/core/jaeger_tracing/ya.make b/ydb/core/jaeger_tracing/ya.make new file mode 100644 index 000000000000..09da909d3bb8 --- /dev/null +++ b/ydb/core/jaeger_tracing/ya.make @@ -0,0 +1,26 @@ +LIBRARY() + +PEERDIR( + ydb/core/protos +) + +SRCS( + request_discriminator.h + request_discriminator.cpp + sampler.h + sampling_throttling_configurator.cpp + sampling_throttling_configurator.h + sampling_throttling_control.cpp + sampling_throttling_control.h + sampling_throttling_control_internals.cpp + sampling_throttling_control_internals.h + settings.h + throttler.h + throttler.cpp +) + +END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/ydb/core/keyvalue/keyvalue_intermediate.cpp b/ydb/core/keyvalue/keyvalue_intermediate.cpp index ed8fa1c355f0..b48d35ef6261 100644 --- a/ydb/core/keyvalue/keyvalue_intermediate.cpp +++ b/ydb/core/keyvalue/keyvalue_intermediate.cpp @@ -80,7 +80,7 @@ TIntermediate::TIntermediate(TActorId respondTo, TActorId keyValueActorId, ui64 , CreatedAtGeneration(channelGeneration) , CreatedAtStep(channelStep) , IsReplied(false) - , Span(TWilsonTablet::Tablet, std::move(traceId), "KeyValue.Intermediate", NWilson::EFlags::AUTO_END) + , Span(TWilsonTablet::TabletTopLevel, std::move(traceId), "KeyValue.Intermediate", NWilson::EFlags::AUTO_END) { Stat.IntermediateCreatedAt = TAppData::TimeProvider->Now(); Stat.RequestType = requestType; @@ -106,7 +106,7 @@ void TIntermediate::UpdateStat() { } } } else { - Stat.IndexRangeRead++; + Stat.IndexRangeRead++; } }; diff --git a/ydb/core/keyvalue/keyvalue_storage_read_request.cpp b/ydb/core/keyvalue/keyvalue_storage_read_request.cpp index 4f2eff31c654..d93ac00aa091 100644 --- a/ydb/core/keyvalue/keyvalue_storage_read_request.cpp +++ b/ydb/core/keyvalue/keyvalue_storage_read_request.cpp @@ -511,7 +511,7 @@ class TKeyValueStorageReadRequest : public TActorBootstrapped(tabletInfo)) , TabletGeneration(tabletGeneration) - , Span(TWilsonTablet::Tablet, IntermediateResult->Span.GetTraceId(), "KeyValue.StorageReadRequest") + , Span(TWilsonTablet::TabletBasic, IntermediateResult->Span.GetTraceId(), "KeyValue.StorageReadRequest") {} }; diff --git a/ydb/core/keyvalue/keyvalue_storage_request.cpp b/ydb/core/keyvalue/keyvalue_storage_request.cpp index 5f2b434b004e..60e07849c1c9 100644 --- a/ydb/core/keyvalue/keyvalue_storage_request.cpp +++ b/ydb/core/keyvalue/keyvalue_storage_request.cpp @@ -88,7 +88,7 @@ class TKeyValueStorageRequest : public TActorBootstrapped(tabletInfo)) - , Span(TWilsonTablet::Tablet, IntermediateResults->Span.GetTraceId(), "KeyValue.StorageRequest") + , Span(TWilsonTablet::TabletBasic, IntermediateResults->Span.GetTraceId(), "KeyValue.StorageRequest") { IntermediateResults->Stat.KeyvalueStorageRequestSentAt = TAppData::TimeProvider->Now(); } diff --git a/ydb/core/keyvalue/ya.make b/ydb/core/keyvalue/ya.make index 740eae043d01..3a014e31316d 100644 --- a/ydb/core/keyvalue/ya.make +++ b/ydb/core/keyvalue/ya.make @@ -45,7 +45,6 @@ PEERDIR( ydb/library/actors/protos ydb/core/base ydb/core/blobstorage/base - ydb/core/control/common_controls ydb/core/engine/minikql ydb/core/keyvalue/protos ydb/core/protos diff --git a/ydb/core/kqp/session_actor/kqp_query_state.h b/ydb/core/kqp/session_actor/kqp_query_state.h index 48f32a7342a2..353e1314601e 100644 --- a/ydb/core/kqp/session_actor/kqp_query_state.h +++ b/ydb/core/kqp/session_actor/kqp_query_state.h @@ -63,7 +63,7 @@ class TKqpQueryState : public TNonCopyable { SetQueryDeadlines(tableServiceConfig, queryServiceConfig); auto action = GetAction(); KqpSessionSpan = NWilson::TSpan( - TWilsonKqp::KqpSession, std::move(RequestEv->GetWilsonTraceId()), + TWilsonKqp::KqpSession, std::move(ev->TraceId), "Session.query." + NKikimrKqp::EQueryAction_Name(action), NWilson::EFlags::AUTO_END); if (RequestEv->GetUserRequestContext()) { UserRequestContext = RequestEv->GetUserRequestContext(); diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index b7f023240eec..ec5ce939a55f 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1248,55 +1248,6 @@ message TImmediateControlsConfig { DefaultValue: 8388608 }]; } - message TTracingControls { - message TSamplingThrottlingOptions { - message TThrottlingOptions { - optional uint64 MaxRatePerMinute = 1 [(ControlOptions) = { - Description: "Maximum amount of traced requests per minute", - MinValue: 0, - MaxValue: 300, - DefaultValue: 0, - }]; - optional uint64 MaxBurst = 2 [(ControlOptions) = { - Description: "Maximum burst of traced events", - MinValue: 0, - MaxValue: 300, - DefaultValue: 0, - }]; - } - - message TSamplingOptions { - optional uint64 PPM = 1 [(ControlOptions) = { - Description: "Average amount of sampled requests per one million", - MinValue: 0, - MaxValue: 1000000, - DefaultValue: 0, - }]; - optional uint64 Level = 2 [(ControlOptions) = { - Description: "Tracing level of sampled requests", - MinValue: 1, - MaxValue: 15, - DefaultValue: 15, - }]; - } - - optional TSamplingOptions Sampling = 1; - optional TThrottlingOptions SampledThrottling = 2; - optional TThrottlingOptions ExternalThrottling = 3; - } - - message TKeyValue { - optional TSamplingThrottlingOptions AcquireLock = 1; - optional TSamplingThrottlingOptions ExecuteTransaction = 2; - optional TSamplingThrottlingOptions Read = 3; - optional TSamplingThrottlingOptions ReadRange = 4; - optional TSamplingThrottlingOptions ListRange = 5; - optional TSamplingThrottlingOptions GetStorageChannelStatus = 6; - } - - optional TKeyValue KeyValue = 1; - } - message TVDiskControls { // SyncLog Data cutter options, not merged to 24-1 reserved 1; @@ -1350,7 +1301,7 @@ message TImmediateControlsConfig { optional TCoordinatorControls CoordinatorControls = 3; optional TSchemeShardControls SchemeShardControls = 4; optional TTCMallocControls TCMallocControls = 5; - optional TTracingControls TracingControls = 6; + reserved 6; optional TVDiskControls VDiskControls = 7; optional TTabletControls TabletControls = 8; }; @@ -1658,33 +1609,88 @@ message TBackgroundCleaningConfig { } message TTracingConfig { - message TAuthConfig { - message TTvm { - optional string Host = 1; - optional uint32 Port = 2; + message TBackendConfig { + message TAuthConfig { + message TTvmAuth { + optional string Url = 1; + + optional uint32 SelfTvmId = 2; + optional uint32 TracingTvmId = 3; - required uint32 SelfTvmId = 3; - required uint32 TracingTvmId = 4; + optional string DiskCacheDir = 4; - optional string DiskCacheDir = 5; + oneof Secret { + string PlainTextSecret = 5; + string SecretFile = 6; + string SecretEnvironmentVariable = 7; + } + } - oneof Secret { - string PlainTextSecret = 6; - string SecretFile = 7; - string SecretEnvironmentVariable = 8; + oneof Method { + TTvmAuth Tvm = 1; } } - oneof Method { - TTvm Tvm = 1; + message TOpentelemetryBackend { + optional string CollectorUrl = 1; + optional string ServiceName = 2; + } + + + oneof Backend { + TOpentelemetryBackend Opentelemetry = 1; } + optional TAuthConfig AuthConfig = 2; } - optional string Host = 1; - optional uint32 Port = 2; - optional string RootCA = 3; - optional string ServiceName = 4; - optional TAuthConfig AuthConfig = 5; + message TSelectors { + reserved 1; + repeated string RequestTypes = 3; + optional string Database = 2; + } + + message TSamplingRule { + // scope to which the rule applies + optional TSelectors Scope = 1; + // fraction of requests sampled by this rule + optional float Fraction = 2; + // detalisation of traces sampled by this rule + optional uint32 Level = 3; + // maximum average amount of traces sampled by this rule + optional uint64 MaxTracesPerMinute = 4; + // maximum burst of traces sampled by this rule + optional uint64 MaxTracesBurst = 5; + } + + // field meaning is the same as in TSamplingRule + message TExternalThrottlingRule { + optional TSelectors Scope = 1; + optional uint64 MaxTracesPerMinute = 2; + optional uint64 MaxTracesBurst = 3; + } + + message TUploaderConfig { + // maximum average amount of spans uploaded from the node + optional uint64 MaxExportedSpansPerSecond = 1; + // maximum batch size in spans + optional uint64 MaxSpansInBatch = 2; + // maximum batch size in bytes + optional uint64 MaxBytesInBatch = 3; + // maximum batch accumulation time + optional uint64 MaxBatchAccumulationMilliseconds = 4; + // time after which generated span will be discarded and will + // not be sent to the collector + optional uint32 SpanExportTimeoutSeconds = 5; + // maximum batch export requests being run simultaneously + optional uint64 MaxExportRequestsInflight = 6; + } + + reserved 1 to 5; + + optional TBackendConfig Backend = 6; + repeated TSamplingRule Sampling = 7; + repeated TExternalThrottlingRule ExternalThrottling = 8; + optional TUploaderConfig Uploader = 9; } message TFailureInjectionConfig { diff --git a/ydb/core/tablet/tablet_req_writelog.cpp b/ydb/core/tablet/tablet_req_writelog.cpp index 864755528b60..619facd2dcd9 100644 --- a/ydb/core/tablet/tablet_req_writelog.cpp +++ b/ydb/core/tablet/tablet_req_writelog.cpp @@ -154,7 +154,7 @@ class TTabletReqWriteLog : public TActorBootstrapped { , CommitTactic(commitTactic) , Info(info) , RepliesToWait(Max()) - , RequestSpan(TWilsonTablet::Tablet, std::move(traceId), "Tablet.WriteLog") + , RequestSpan(TWilsonTablet::TabletDetailed, std::move(traceId), "Tablet.WriteLog") { References.swap(refs); Y_ABORT_UNLESS(Info); @@ -171,9 +171,9 @@ class TTabletReqWriteLog : public TActorBootstrapped { NWilson::TTraceId innerTraceId; if (RequestSpan) { - auto res = BlobSpans.try_emplace(ref.Id, TWilsonTablet::Tablet, RequestSpan.GetTraceId(), "Tablet.WriteLog.Reference"); + auto res = BlobSpans.try_emplace(ref.Id, TWilsonTablet::TabletDetailed, RequestSpan.GetTraceId(), "Tablet.WriteLog.Reference"); - innerTraceId = std::move(res.first->second.GetTraceId()); + innerTraceId = res.first->second.GetTraceId(); } SendToBS(ref.Id, ref.Buffer, ctx, handleClass, ref.Tactic ? *ref.Tactic : CommitTactic, std::move(innerTraceId)); @@ -191,7 +191,7 @@ class TTabletReqWriteLog : public TActorBootstrapped { NWilson::TTraceId traceId; if (RequestSpan) { - auto res = BlobSpans.try_emplace(actualLogEntryId, TWilsonTablet::Tablet, RequestSpan.GetTraceId(), "Tablet.WriteLog.LogEntry"); + auto res = BlobSpans.try_emplace(actualLogEntryId, TWilsonTablet::TabletDetailed, RequestSpan.GetTraceId(), "Tablet.WriteLog.LogEntry"); traceId = std::move(res.first->second.GetTraceId()); } diff --git a/ydb/core/tablet_flat/flat_exec_seat.h b/ydb/core/tablet_flat/flat_exec_seat.h index 500b711fadbd..2eef8f0c59f6 100644 --- a/ydb/core/tablet_flat/flat_exec_seat.h +++ b/ydb/core/tablet_flat/flat_exec_seat.h @@ -35,7 +35,7 @@ namespace NTabletFlatExecutor { void Terminate(ETerminationReason reason, const TActorContext& ctx) noexcept; void StartEnqueuedSpan() noexcept { - WaitingSpan = NWilson::TSpan(TWilsonTablet::Tablet, Self->TxSpan.GetTraceId(), "Tablet.Transaction.Enqueued"); + WaitingSpan = NWilson::TSpan(TWilsonTablet::TabletDetailed, Self->TxSpan.GetTraceId(), "Tablet.Transaction.Enqueued"); } void FinishEnqueuedSpan() noexcept { @@ -43,7 +43,7 @@ namespace NTabletFlatExecutor { } void CreatePendingSpan() noexcept { - WaitingSpan = NWilson::TSpan(TWilsonTablet::Tablet, Self->TxSpan.GetTraceId(), "Tablet.Transaction.Pending"); + WaitingSpan = NWilson::TSpan(TWilsonTablet::TabletDetailed, Self->TxSpan.GetTraceId(), "Tablet.Transaction.Pending"); } void FinishPendingSpan() noexcept { diff --git a/ydb/core/tablet_flat/flat_executor.cpp b/ydb/core/tablet_flat/flat_executor.cpp index d46455f49554..fbc8e7f45c47 100644 --- a/ydb/core/tablet_flat/flat_executor.cpp +++ b/ydb/core/tablet_flat/flat_executor.cpp @@ -4241,7 +4241,7 @@ TString TExecutor::CheckBorrowConsistency() { TTransactionWaitPad::TTransactionWaitPad(THolder seat) : Seat(std::move(seat)) - , WaitingSpan(NWilson::TSpan(TWilsonTablet::Tablet, Seat->GetTxTraceId(), "Tablet.Transaction.Wait")) + , WaitingSpan(NWilson::TSpan(TWilsonTablet::TabletDetailed, Seat->GetTxTraceId(), "Tablet.Transaction.Wait")) {} TTransactionWaitPad::~TTransactionWaitPad() diff --git a/ydb/core/tablet_flat/tablet_flat_executor.h b/ydb/core/tablet_flat/tablet_flat_executor.h index edd006ed266f..c099417b9d94 100644 --- a/ydb/core/tablet_flat/tablet_flat_executor.h +++ b/ydb/core/tablet_flat/tablet_flat_executor.h @@ -229,7 +229,7 @@ class TTransactionContext : public TTxMemoryProviderBase { } void StartExecutionSpan() noexcept { - TransactionExecutionSpan = NWilson::TSpan(TWilsonTablet::Tablet, TransactionSpan.GetTraceId(), "Tablet.Transaction.Execute"); + TransactionExecutionSpan = NWilson::TSpan(TWilsonTablet::TabletDetailed, TransactionSpan.GetTraceId(), "Tablet.Transaction.Execute"); } void FinishExecutionSpan() noexcept { @@ -289,7 +289,7 @@ class ITransaction : TNonCopyable { { } ITransaction(NWilson::TTraceId &&traceId) - : TxSpan(NWilson::TSpan(TWilsonTablet::Tablet, std::move(traceId), "Tablet.Transaction")) + : TxSpan(NWilson::TSpan(TWilsonTablet::TabletBasic, std::move(traceId), "Tablet.Transaction")) { } virtual ~ITransaction() = default; @@ -312,8 +312,10 @@ class ITransaction : TNonCopyable { } void SetupTxSpan(NWilson::TTraceId traceId) noexcept { - TxSpan = NWilson::TSpan(TWilsonTablet::Tablet, std::move(traceId), "Tablet.Transaction"); - TxSpan.Attribute("Type", TypeName(*this)); + TxSpan = NWilson::TSpan(TWilsonTablet::TabletBasic, std::move(traceId), "Tablet.Transaction"); + if (TxSpan) { + TxSpan.Attribute("Type", TypeName(*this)); + } } public: diff --git a/ydb/core/testlib/test_client.cpp b/ydb/core/testlib/test_client.cpp index f6816c4063da..9cb347017589 100644 --- a/ydb/core/testlib/test_client.cpp +++ b/ydb/core/testlib/test_client.cpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -306,14 +307,19 @@ namespace Tests { grpcRequestProxies.reserve(proxyCount); auto& appData = Runtime->GetAppData(); + NJaegerTracing::TSamplingThrottlingConfigurator tracingConfigurator(appData.TimeProvider, appData.RandomProvider); for (size_t i = 0; i < proxyCount; ++i) { - auto grpcRequestProxy = NGRpcService::CreateGRpcRequestProxy(*Settings->AppConfig, appData.Icb); + auto grpcRequestProxy = NGRpcService::CreateGRpcRequestProxy(*Settings->AppConfig, tracingConfigurator.GetControl()); auto grpcRequestProxyId = system->Register(grpcRequestProxy, TMailboxType::ReadAsFilled); system->RegisterLocalService(NGRpcService::CreateGRpcRequestProxyId(), grpcRequestProxyId); grpcRequestProxies.push_back(grpcRequestProxyId); } + system->Register( + NConsole::CreateJaegerTracingConfigurator(std::move(tracingConfigurator), Settings->AppConfig->GetTracingConfig()) + ); + auto grpcMon = system->Register(NGRpcService::CreateGrpcMonService(), TMailboxType::ReadAsFilled); system->RegisterLocalService(NGRpcService::GrpcMonServiceId(), grpcMon); diff --git a/ydb/core/tx/datashard/datashard.cpp b/ydb/core/tx/datashard/datashard.cpp index d78e5f7126ef..d3e1afb78ac8 100644 --- a/ydb/core/tx/datashard/datashard.cpp +++ b/ydb/core/tx/datashard/datashard.cpp @@ -2947,8 +2947,10 @@ void TDataShard::ProposeTransaction(TEvDataShard::TEvProposeTransaction::TPtr && UpdateProposeQueueSize(); } else { // Prepare planned transactions as soon as possible - NWilson::TSpan datashardTransactionSpan(TWilsonTablet::Tablet, std::move(ev->TraceId), "Datashard.Transaction", NWilson::EFlags::AUTO_END); - datashardTransactionSpan.Attribute("Shard", std::to_string(TabletID())); + NWilson::TSpan datashardTransactionSpan(TWilsonTablet::TabletTopLevel, std::move(ev->TraceId), "Datashard.Transaction", NWilson::EFlags::AUTO_END); + if (datashardTransactionSpan) { + datashardTransactionSpan.Attribute("Shard", std::to_string(TabletID())); + } Execute(new TTxProposeTransactionBase(this, std::move(ev), TAppData::TimeProvider->Now(), NextTieBreakerIndex++, /* delayed */ false, std::move(datashardTransactionSpan)), ctx); } @@ -2968,8 +2970,10 @@ void TDataShard::ProposeTransaction(NEvents::TDataEvents::TEvWrite::TPtr&& ev, c UpdateProposeQueueSize(); } else { // Prepare planned transactions as soon as possible - NWilson::TSpan datashardTransactionSpan(TWilsonTablet::Tablet, std::move(ev->TraceId), "Datashard.WriteTransaction", NWilson::EFlags::AUTO_END); - datashardTransactionSpan.Attribute("Shard", std::to_string(TabletID())); + NWilson::TSpan datashardTransactionSpan(TWilsonTablet::TabletTopLevel, std::move(ev->TraceId), "Datashard.WriteTransaction", NWilson::EFlags::AUTO_END); + if (datashardTransactionSpan) { + datashardTransactionSpan.Attribute("Shard", std::to_string(TabletID())); + } Execute(new TTxWrite(this, std::move(ev), TAppData::TimeProvider->Now(), NextTieBreakerIndex++, /* delayed */ false, std::move(datashardTransactionSpan)), ctx); } @@ -3036,16 +3040,20 @@ void TDataShard::Handle(TEvPrivate::TEvDelayedProposeTransaction::TPtr &ev, cons switch (item.Event->GetTypeRewrite()) { case TEvDataShard::TEvProposeTransaction::EventType: { auto event = IEventHandle::Downcast(std::move(item.Event)); - NWilson::TSpan datashardTransactionSpan(TWilsonTablet::Tablet, std::move(event->TraceId), "Datashard.Transaction", NWilson::EFlags::AUTO_END); - datashardTransactionSpan.Attribute("Shard", std::to_string(TabletID())); + NWilson::TSpan datashardTransactionSpan(TWilsonTablet::TabletTopLevel, std::move(event->TraceId), "Datashard.Transaction", NWilson::EFlags::AUTO_END); + if (datashardTransactionSpan) { + datashardTransactionSpan.Attribute("Shard", std::to_string(TabletID())); + } Execute(new TTxProposeTransactionBase(this, std::move(event), item.ReceivedAt, item.TieBreakerIndex, /* delayed */ true, std::move(datashardTransactionSpan)), ctx); return; } case NEvents::TDataEvents::TEvWrite::EventType: { auto event = IEventHandle::Downcast(std::move(item.Event)); - NWilson::TSpan datashardTransactionSpan(TWilsonTablet::Tablet, std::move(event->TraceId), "Datashard.WriteTransaction", NWilson::EFlags::AUTO_END); - datashardTransactionSpan.Attribute("Shard", std::to_string(TabletID())); + NWilson::TSpan datashardTransactionSpan(TWilsonTablet::TabletTopLevel, std::move(event->TraceId), "Datashard.WriteTransaction", NWilson::EFlags::AUTO_END); + if (datashardTransactionSpan) { + datashardTransactionSpan.Attribute("Shard", std::to_string(TabletID())); + } Execute(new TTxWrite(this, std::move(event), item.ReceivedAt, item.TieBreakerIndex, /* delayed */ true, std::move(datashardTransactionSpan)), ctx); return; diff --git a/ydb/core/tx/datashard/datashard__op_rows.cpp b/ydb/core/tx/datashard/datashard__op_rows.cpp index 5cee5d62acab..0372e649947e 100644 --- a/ydb/core/tx/datashard/datashard__op_rows.cpp +++ b/ydb/core/tx/datashard/datashard__op_rows.cpp @@ -16,7 +16,7 @@ class TTxDirectBase : public TTransactionBase { public: TTxDirectBase(TDataShard* ds, TEvRequest ev) - : TBase(ds) + : TBase(ds, std::move(ev->TraceId)) , Ev(ev) { } diff --git a/ydb/core/tx/datashard/datashard__read_iterator.cpp b/ydb/core/tx/datashard/datashard__read_iterator.cpp index 2c2cff174e9d..e59adc95af56 100644 --- a/ydb/core/tx/datashard/datashard__read_iterator.cpp +++ b/ydb/core/tx/datashard/datashard__read_iterator.cpp @@ -2554,7 +2554,7 @@ void TDataShard::Handle(TEvDataShard::TEvRead::TPtr& ev, const TActorContext& ct auto* request = ev->Get(); if (!request->ReadSpan) { - request->ReadSpan = NWilson::TSpan(TWilsonTablet::Tablet, std::move(ev->TraceId), "Datashard.Read", NWilson::EFlags::AUTO_END); + request->ReadSpan = NWilson::TSpan(TWilsonTablet::TabletTopLevel, std::move(ev->TraceId), "Datashard.Read", NWilson::EFlags::AUTO_END); request->ReadSpan.Attribute("Shard", std::to_string(TabletID())); } diff --git a/ydb/core/tx/datashard/datashard_pipeline.cpp b/ydb/core/tx/datashard/datashard_pipeline.cpp index 15cfdc4c88f7..e0f8d2d2c2bd 100644 --- a/ydb/core/tx/datashard/datashard_pipeline.cpp +++ b/ydb/core/tx/datashard/datashard_pipeline.cpp @@ -1741,7 +1741,7 @@ EExecutionStatus TPipeline::RunExecutionPlan(TOperation::TPtr op, return EExecutionStatus::Reschedule; } - NWilson::TSpan unitSpan(TWilsonTablet::Tablet, txc.TransactionExecutionSpan.GetTraceId(), "Datashard.Unit"); + NWilson::TSpan unitSpan(TWilsonTablet::TabletDetailed, txc.TransactionExecutionSpan.GetTraceId(), "Datashard.Unit"); NCpuTime::TCpuTimer timer; auto status = unit.Execute(op, txc, ctx); diff --git a/ydb/core/tx/datashard/datashard_ut_trace.cpp b/ydb/core/tx/datashard/datashard_ut_trace.cpp index b7be979310cc..3eb2127496da 100644 --- a/ydb/core/tx/datashard/datashard_ut_trace.cpp +++ b/ydb/core/tx/datashard/datashard_ut_trace.cpp @@ -31,79 +31,8 @@ Y_UNIT_TEST_SUITE(TDataShardTrace) { auto &runtime = *server->GetRuntime(); TAutoPtr handle; - THolder request; - if (traceId) { - struct RequestCtx : NGRpcService::IRequestCtxMtSafe { - RequestCtx(NWilson::TTraceId &&traceId) : TraceId(std::move(traceId)) {} - - NWilson::TTraceId GetWilsonTraceId() const override { - return TraceId.Clone(); - } - - TMaybe GetTraceId() const override { - return Nothing(); - } - - const TMaybe GetDatabaseName() const override { - return ""; - } - - const TIntrusiveConstPtr& GetInternalToken() const override { - return Ptr; - } - - const TString& GetSerializedToken() const override { - return Token; - } - - bool IsClientLost() const override { - return false; - }; - - virtual const google::protobuf::Message* GetRequest() const override { - return nullptr; - }; - - const TMaybe GetRequestType() const override { - return "_document_api_request"; - }; - - void SetFinishAction(std::function&& cb) override { - Y_UNUSED(cb); - }; - - google::protobuf::Arena* GetArena() override { - return nullptr; - }; - - TIntrusiveConstPtr Ptr; - TString Token; - NWilson::TTraceId TraceId; - }; - - auto *txControl = google::protobuf::Arena::CreateMessage(&arena); - txControl->mutable_begin_tx()->mutable_serializable_read_write(); - txControl->set_commit_tx(true); - - auto ptr = std::make_shared(std::move(traceId)); - request = MakeHolder( - NKikimrKqp::QUERY_ACTION_EXECUTE, - NKikimrKqp::QUERY_TYPE_SQL_DML, - TActorId(), - ptr, - TString(), //sessionId - TString(sql), - TString(), //queryId - txControl, //tx_control - nullptr, //ydbParameters - Ydb::Table::QueryStatsCollection::STATS_COLLECTION_UNSPECIFIED, //collectStats - nullptr, // query_cache_policy - nullptr //operationParams - ); - } else { - request = MakeSQLRequest(sql, true); - } - runtime.Send(new IEventHandle(NKqp::MakeKqpProxyID(runtime.GetNodeId()), sender, request.Release(), 0, 0, nullptr)); + THolder request = MakeSQLRequest(sql, true); + runtime.Send(new IEventHandle(NKqp::MakeKqpProxyID(runtime.GetNodeId()), sender, request.Release(), 0, 0, nullptr, std::move(traceId))); auto ev = runtime.GrabEdgeEventRethrow(sender); UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Record.GetRef().GetYdbStatus(), code); } diff --git a/ydb/core/tx/datashard/export_common.h b/ydb/core/tx/datashard/export_common.h index df3434a45cc7..631e8c71e1b9 100644 --- a/ydb/core/tx/datashard/export_common.h +++ b/ydb/core/tx/datashard/export_common.h @@ -3,6 +3,7 @@ #include "datashard_user_table.h" #include +#include #include #include diff --git a/ydb/core/tx/tx_proxy/upload_rows_common_impl.h b/ydb/core/tx/tx_proxy/upload_rows_common_impl.h index 16f4964299b5..0826905f9069 100644 --- a/ydb/core/tx/tx_proxy/upload_rows_common_impl.h +++ b/ydb/core/tx/tx_proxy/upload_rows_common_impl.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -29,6 +28,8 @@ #undef INCLUDE_YDB_INTERNAL_H #include +#include +#include #include #include @@ -208,18 +209,21 @@ class TUploadRowsBase : public TActorBootstrapped Batch; float RuCost = 0.0; + NWilson::TSpan Span; + public: static constexpr NKikimrServices::TActivity::EType ActorActivityType() { return DerivedActivityType; } - explicit TUploadRowsBase(TDuration timeout = TDuration::Max(), bool diskQuotaExceeded = false) + explicit TUploadRowsBase(TDuration timeout = TDuration::Max(), bool diskQuotaExceeded = false, NWilson::TSpan span = {}) : TBase() , SchemeCache(MakeSchemeCacheID()) , LeaderPipeCache(MakePipePeNodeCacheID(false)) , Timeout((timeout && timeout <= DEFAULT_TIMEOUT) ? timeout : DEFAULT_TIMEOUT) , Status(Ydb::StatusIds::SUCCESS) , DiskQuotaExceeded(diskQuotaExceeded) + , Span(std::move(span)) {} void Bootstrap(const NActors::TActorContext& ctx) { @@ -232,10 +236,10 @@ class TUploadRowsBase : public TActorBootstrappedResultSet.emplace_back(entry); - ctx.Send(SchemeCache, new TEvTxProxySchemeCache::TEvNavigateKeySet(request)); + ctx.Send(SchemeCache, new TEvTxProxySchemeCache::TEvNavigateKeySet(request), 0, 0, Span.GetTraceId()); TimeoutTimerActorId = CreateLongTimer(ctx, Timeout, new IEventHandle(ctx.SelfID, ctx.SelfID, new TEvents::TEvWakeup())); @@ -743,7 +748,7 @@ class TUploadRowsBase : public TActorBootstrappedResultSet.emplace_back(std::move(keyRange)); TAutoPtr resolveReq(new TEvTxProxySchemeCache::TEvResolveKeySet(request)); - ctx.Send(SchemeCache, resolveReq.Release()); + ctx.Send(SchemeCache, resolveReq.Release(), 0, 0, Span.GetTraceId()); TBase::Become(&TThis::StateWaitResolveShards); } @@ -1027,7 +1032,7 @@ class TUploadRowsBase : public TActorBootstrappedRecord.SetOverloadSubscribe(seqNo); state->SentOverloadSeqNo = seqNo; - ctx.Send(LeaderPipeCache, new TEvPipeCache::TEvForward(ev.release(), shardId, true), IEventHandle::FlagTrackDelivery); + ctx.Send(LeaderPipeCache, new TEvPipeCache::TEvForward(ev.release(), shardId, true), IEventHandle::FlagTrackDelivery, 0, Span.GetTraceId()); } void MakeShardRequests(const NActors::TActorContext& ctx) { @@ -1109,7 +1114,7 @@ class TUploadRowsBase : public TActorBootstrappedRecord.SetOverloadSubscribe(seqNo); uploadRetryStates[idx]->SentOverloadSeqNo = seqNo; - ctx.Send(LeaderPipeCache, new TEvPipeCache::TEvForward(ev.release(), shardId, true), IEventHandle::FlagTrackDelivery); + ctx.Send(LeaderPipeCache, new TEvPipeCache::TEvForward(ev.release(), shardId, true), IEventHandle::FlagTrackDelivery, 0, Span.GetTraceId()); auto res = ShardRepliesLeft.insert(shardId); if (!res.second) { @@ -1133,7 +1138,7 @@ class TUploadRowsBase : public TActorBootstrappedTableId, TActorId())); + ctx.Send(SchemeCache, new TEvTxProxySchemeCache::TEvInvalidateTable(GetKeyRange()->TableId, TActorId()), 0, 0, Span.GetTraceId()); SetError(Ydb::StatusIds::UNAVAILABLE, Sprintf("Failed to connect to shard %" PRIu64, ev->Get()->TabletId)); ShardRepliesLeft.erase(ev->Get()->TabletId); @@ -1170,7 +1175,7 @@ class TUploadRowsBase : public TActorBootstrappedTableId, TActorId())); + ctx.Send(SchemeCache, new TEvTxProxySchemeCache::TEvInvalidateTable(GetKeyRange()->TableId, TActorId()), 0, 0, Span.GetTraceId()); status = Ydb::StatusIds::OVERLOADED; break; case NKikimrTxDataShard::TError::DISK_SPACE_EXHAUSTED: @@ -1203,7 +1208,7 @@ class TUploadRowsBase : public TActorBootstrapped +#include + +namespace NWilson { + +inline void EndSpanWithStatus(NWilson::TSpan& span, Ydb::StatusIds::StatusCode statusCode) { + if (statusCode == Ydb::StatusIds::SUCCESS) { + span.EndOk(); + } else { + span.EndError(Ydb::StatusIds_StatusCode_Name(statusCode)); + } +} + +} // namespace NWilson diff --git a/ydb/core/util/ya.make b/ydb/core/util/ya.make index b321355669a9..abcd51fb3586 100644 --- a/ydb/core/util/ya.make +++ b/ydb/core/util/ya.make @@ -60,12 +60,14 @@ SRCS( ui64id.cpp ui64id.h wildcard.h + wilson.h ) PEERDIR( ydb/library/actors/core ydb/library/actors/interconnect/mock ydb/library/actors/util + ydb/library/actors/wilson library/cpp/containers/stack_vector library/cpp/html/escape library/cpp/ipmath diff --git a/ydb/core/ya.make b/ydb/core/ya.make index ddc2239a6b06..448497f2b4cd 100644 --- a/ydb/core/ya.make +++ b/ydb/core/ya.make @@ -60,6 +60,7 @@ RECURSE( ymq driver_lib yql_testlib + jaeger_tracing ) RECURSE_FOR_TESTS( diff --git a/ydb/library/actors/wilson/wilson_span.cpp b/ydb/library/actors/wilson/wilson_span.cpp index ac66ac2ef002..233397a9abf2 100644 --- a/ydb/library/actors/wilson/wilson_span.cpp +++ b/ydb/library/actors/wilson/wilson_span.cpp @@ -1,5 +1,6 @@ #include "wilson_span.h" #include "wilson_uploader.h" +#include #include namespace NWilson { @@ -62,6 +63,19 @@ namespace NWilson { Data->Sent = true; } + TSpan& TSpan::operator=(TSpan&& other) { + if (this != &other) { + if (Y_UNLIKELY(*this)) { + TStringStream err; + err << "TSpan instance incorrectly overwritten at:\n"; + FormatBackTrace(&err); + EndError(std::move(err.Str())); + } + Data = std::exchange(other.Data, nullptr); + } + return *this; + } + const TSpan TSpan::Empty; } // NWilson diff --git a/ydb/library/actors/wilson/wilson_span.h b/ydb/library/actors/wilson/wilson_span.h index d8bd9116c6f3..20170a896ea7 100644 --- a/ydb/library/actors/wilson/wilson_span.h +++ b/ydb/library/actors/wilson/wilson_span.h @@ -126,16 +126,7 @@ namespace NWilson { } TSpan& operator =(const TSpan&) = delete; - - TSpan& operator =(TSpan&& other) { - if (this != &other) { - if (Y_UNLIKELY(*this)) { - EndError("TSpan instance incorrectly overwritten"); - } - Data = std::exchange(other.Data, nullptr); - } - return *this; - } + TSpan& operator=(TSpan&& other); explicit operator bool() const { return Data && !Data->Sent && !Data->Ignored; diff --git a/ydb/library/actors/wilson/wilson_trace.cpp b/ydb/library/actors/wilson/wilson_trace.cpp index ec244856bc43..28bdb7efa801 100644 --- a/ydb/library/actors/wilson/wilson_trace.cpp +++ b/ydb/library/actors/wilson/wilson_trace.cpp @@ -4,7 +4,7 @@ #include namespace NWilson { - TTraceId TTraceId::FromTraceparentHeader(const TStringBuf header) { + TTraceId TTraceId::FromTraceparentHeader(const TStringBuf header, ui8 verbosity) { constexpr size_t versionChars = 2; // Only version 0 is supported constexpr size_t versionStart = 0; @@ -61,7 +61,7 @@ namespace NWilson { return {}; } - return TTraceId(traceId, spanId, 15, Max()); + return TTraceId(traceId, spanId, verbosity, Max()); } TString TTraceId::GetHexTraceId() const { diff --git a/ydb/library/actors/wilson/wilson_trace.h b/ydb/library/actors/wilson/wilson_trace.h index 0db42776f922..841a4b9e40be 100644 --- a/ydb/library/actors/wilson/wilson_trace.h +++ b/ydb/library/actors/wilson/wilson_trace.h @@ -181,7 +181,7 @@ namespace NWilson { return TTraceId(); } - static TTraceId FromTraceparentHeader(const TStringBuf header); + static TTraceId FromTraceparentHeader(const TStringBuf header, ui8 verbosity = 15); TTraceId Span(ui8 verbosity) const { Validate(); diff --git a/ydb/library/actors/wilson/wilson_uploader.cpp b/ydb/library/actors/wilson/wilson_uploader.cpp index 778c2b24f772..938f856e75dd 100644 --- a/ydb/library/actors/wilson/wilson_uploader.cpp +++ b/ydb/library/actors/wilson/wilson_uploader.cpp @@ -1,12 +1,16 @@ #include "wilson_uploader.h" + #include #include #include #include #include +#include #include #include + #include +#include namespace NWilson { @@ -17,14 +21,92 @@ namespace NWilson { namespace { + struct TSpan { + TMonotonic ExpirationTimestamp; + NTraceProto::Span Span; + size_t Size; + }; + + class TBatch { + private: + ui64 MaxSpansInBatch; + ui64 MaxBytesInBatch; + + NServiceProto::ExportTraceServiceRequest Request; + NTraceProto::ScopeSpans* ScopeSpans; + ui64 SizeBytes = 0; + TMonotonic ExpirationTimestamp = TMonotonic::Zero(); + + public: + struct TData { + NServiceProto::ExportTraceServiceRequest Request; + ui64 SizeBytes; + ui64 SizeSpans; + TMonotonic ExpirationTimestamp; + }; + + TBatch(ui64 maxSpansInBatch, ui64 maxBytesInBatch, TString serviceName) + : MaxSpansInBatch(maxSpansInBatch) + , MaxBytesInBatch(maxBytesInBatch) + { + auto *rspan = Request.add_resource_spans(); + auto *serviceNameAttr = rspan->mutable_resource()->add_attributes(); + serviceNameAttr->set_key("service.name"); + serviceNameAttr->mutable_value()->set_string_value(std::move(serviceName)); + ScopeSpans = rspan->add_scope_spans(); + } + + size_t SizeSpans() const { + return ScopeSpans->spansSize(); + } + + bool IsEmpty() const { + return SizeSpans() == 0; + } + + bool Add(TSpan& span) { + if (SizeBytes + span.Size > MaxBytesInBatch || SizeSpans() == MaxSpansInBatch) { + return false; + } + SizeBytes += span.Size; + span.Span.Swap(ScopeSpans->add_spans()); + ExpirationTimestamp = span.ExpirationTimestamp; + return true; + } + + TData Complete() && { + return TData { + .Request = std::move(Request), + .SizeBytes = SizeBytes, + .SizeSpans = SizeSpans(), + .ExpirationTimestamp = ExpirationTimestamp, + }; + } + }; + + struct TExportRequestData : TIntrusiveListItem { + std::unique_ptr Context; + std::unique_ptr> Reader; + grpc::Status Status; + NServiceProto::ExportTraceServiceResponse Response; + }; + class TWilsonUploader : public TActorBootstrapped { static constexpr size_t WILSON_SERVICE_ID = 430; - TString Host; - ui16 Port; - TString RootCA; + ui64 MaxPendingSpanBytes = 100'000'000; + ui64 MaxSpansPerSecond; + ui64 MaxSpansInBatch; + ui64 MaxBytesInBatch; + TDuration MaxBatchAccumulation = TDuration::Seconds(1); + TDuration MaxSpanTimeInQueue; + ui64 MaxExportInflight; + + bool WakeupScheduled = false; + + TString CollectorUrl; TString ServiceName; std::shared_ptr Channel; @@ -32,33 +114,30 @@ namespace NWilson { grpc::CompletionQueue CQ; std::unique_ptr GrpcSigner; - std::unique_ptr Context; - std::unique_ptr> Reader; - NServiceProto::ExportTraceServiceResponse Response; - grpc::Status Status; - - struct TSpanQueueItem { - TMonotonic ExpirationTimestamp; - NTraceProto::Span Span; - ui32 Size; - }; - std::deque Spans; - ui64 SpansSize = 0; + TBatch CurrentBatch; + std::queue BatchQueue; + ui64 SpansSizeBytes = 0; TMonotonic NextSendTimestamp; - ui32 MaxSpansAtOnce = 25; - ui32 MaxSpansPerSecond = 10; - TDuration MaxSpanTimeInQueue = TDuration::Seconds(60); - bool WakeupScheduled = false; + bool BatchCompletionScheduled = false; + TMonotonic NextBatchCompletion; + + TIntrusiveListWithAutoDelete ExportRequests; + size_t ExportRequestsCount = 0; public: - TWilsonUploader(WilsonUploaderParams params) - : Host(std::move(params.Host)) - , Port(std::move(params.Port)) - , RootCA(std::move(params.RootCA)) + TWilsonUploader(TWilsonUploaderParams params) + : MaxSpansPerSecond(params.MaxExportedSpansPerSecond) + , MaxSpansInBatch(params.MaxSpansInBatch) + , MaxBytesInBatch(params.MaxBytesInBatch) + , MaxBatchAccumulation(TDuration::MilliSeconds(params.MaxBatchAccumulationMilliseconds)) + , MaxSpanTimeInQueue(TDuration::Seconds(params.SpanExportTimeoutSeconds)) + , MaxExportInflight(params.MaxExportRequestsInflight) + , CollectorUrl(std::move(params.CollectorUrl)) , ServiceName(std::move(params.ServiceName)) , GrpcSigner(std::move(params.GrpcSigner)) + , CurrentBatch(MaxSpansInBatch, MaxBytesInBatch, ServiceName) {} ~TWilsonUploader() { @@ -68,139 +147,227 @@ namespace NWilson { static constexpr char ActorName[] = "WILSON_UPLOADER_ACTOR"; void Bootstrap() { - Become(&TThis::StateFunc); + Become(&TThis::StateWork); + + if (MaxSpansPerSecond == 0) { + ALOG_WARN(WILSON_SERVICE_ID, "max_spans_per_second should be greater than 0, changing to 1"); + MaxSpansPerSecond = 1; + } + if (MaxSpansInBatch == 0) { + ALOG_WARN(WILSON_SERVICE_ID, "max_spans_in_batch shold be greater than 0, changing to 1"); + MaxSpansInBatch = 1; + } + if (MaxExportInflight == 0) { + ALOG_WARN(WILSON_SERVICE_ID, "max_span_export_inflight should be greater than 0, changing to 1"); + MaxExportInflight = 1; + } - Channel = grpc::CreateChannel(TStringBuilder() << Host << ":" << Port, RootCA ? grpc::SslCredentials({ - .pem_root_certs = TFileInput(RootCA).ReadAll(), - }) : grpc::InsecureChannelCredentials()); + TStringBuf scheme; + TStringBuf host; + ui16 port; + if (!TryGetSchemeHostAndPort(CollectorUrl, scheme, host, port)) { + ALOG_ERROR(WILSON_SERVICE_ID, "Failed to parse collector url (" << CollectorUrl << " was provided). Wilson wouldn't work"); + Become(&TThis::StateBroken); + return; + } else if (scheme != "grpc://" && scheme != "grpcs://") { + ALOG_ERROR(WILSON_SERVICE_ID, "Wrong scheme provided: " << scheme << " (only grpc:// and grpcs:// are supported). Wilson wouldn't work"); + Become(&TThis::StateBroken); + return; + } + Channel = grpc::CreateChannel(TStringBuilder() << host << ":" << port, + scheme == "grpcs://" ? grpc::SslCredentials({}) : grpc::InsecureChannelCredentials()); Stub = NServiceProto::TraceService::NewStub(Channel); - LOG_INFO_S(*TlsActivationContext, WILSON_SERVICE_ID, "TWilsonUploader::Bootstrap"); + ALOG_INFO(WILSON_SERVICE_ID, "TWilsonUploader::Bootstrap"); } void Handle(TEvWilson::TPtr ev) { - if (SpansSize >= 100'000'000) { - LOG_ERROR_S(*TlsActivationContext, WILSON_SERVICE_ID, "dropped span due to overflow"); + if (SpansSizeBytes >= MaxPendingSpanBytes) { + ALOG_ERROR(WILSON_SERVICE_ID, "dropped span due to overflow"); } else { - const TMonotonic expirationTimestamp = TActivationContext::Monotonic() + MaxSpanTimeInQueue; + const TMonotonic now = TActivationContext::Monotonic(); + const TMonotonic expirationTimestamp = now + MaxSpanTimeInQueue; auto& span = ev->Get()->Span; const ui32 size = span.ByteSizeLong(); - Spans.push_back(TSpanQueueItem{expirationTimestamp, std::move(span), size}); - SpansSize += size; + if (size > MaxBytesInBatch) { + ALOG_ERROR(WILSON_SERVICE_ID, "dropped span of size " << size << ", which exceeds max batch size " << MaxBytesInBatch); + return; + } + TSpan spanItem { + .ExpirationTimestamp = expirationTimestamp, + .Span = std::move(span), + .Size = size, + }; + SpansSizeBytes += size; + if (CurrentBatch.IsEmpty()) { + ScheduleBatchCompletion(now); + } + if (CurrentBatch.Add(spanItem)) { + return; + } + CompleteCurrentBatch(); TryMakeProgress(); + Y_ABORT_UNLESS(CurrentBatch.Add(spanItem), "failed to add span to empty batch"); + ScheduleBatchCompletion(now); + } + } + + void ScheduleBatchCompletionEvent() { + Y_ABORT_UNLESS(!BatchCompletionScheduled); + auto cookie = NextBatchCompletion.GetValue(); + TActivationContext::Schedule(NextBatchCompletion, new IEventHandle(TEvents::TSystem::Wakeup, 0, SelfId(), {}, nullptr, cookie)); + ALOG_TRACE(WILSON_SERVICE_ID, "scheduling batch completion w/ cookie=" << cookie); + BatchCompletionScheduled = true; + } + + void ScheduleBatchCompletion(TMonotonic now) { + NextBatchCompletion = now + MaxBatchAccumulation; + if (!BatchCompletionScheduled) { + ScheduleBatchCompletionEvent(); + } + } + + void CompleteCurrentBatch() { + if (CurrentBatch.IsEmpty()) { + return; } + BatchQueue.push(std::move(CurrentBatch).Complete()); + CurrentBatch = TBatch(MaxSpansInBatch, MaxBytesInBatch, ServiceName); } void TryToSend() { const TMonotonic now = TActivationContext::Monotonic(); ui32 numSpansDropped = 0; - while (!Spans.empty()) { - const TSpanQueueItem& item = Spans.front(); + while (!BatchQueue.empty()) { + const TBatch::TData& item = BatchQueue.front(); if (item.ExpirationTimestamp <= now) { - SpansSize -= item.Size; - Spans.pop_front(); - ++numSpansDropped; + SpansSizeBytes -= item.SizeBytes; + numSpansDropped += item.SizeSpans; + BatchQueue.pop(); } else { break; } } if (numSpansDropped) { - LOG_ERROR_S(*TlsActivationContext, WILSON_SERVICE_ID, + ALOG_ERROR(WILSON_SERVICE_ID, "dropped " << numSpansDropped << " span(s) due to expiration"); } - if (Context || Spans.empty()) { + if (ExportRequestsCount >= MaxExportInflight || BatchQueue.empty()) { return; } else if (now < NextSendTimestamp) { ScheduleWakeup(NextSendTimestamp); return; } - NServiceProto::ExportTraceServiceRequest request; - auto *rspan = request.add_resource_spans(); - auto *serviceNameAttr = rspan->mutable_resource()->add_attributes(); - serviceNameAttr->set_key("service.name"); - serviceNameAttr->mutable_value()->set_string_value(ServiceName); - auto *sspan = rspan->add_scope_spans(); - - NextSendTimestamp = now; - for (ui32 i = 0; i < MaxSpansAtOnce && !Spans.empty(); ++i, Spans.pop_front()) { - auto& item = Spans.front(); - auto& s = item.Span; - - LOG_DEBUG_S(*TlsActivationContext, WILSON_SERVICE_ID, "exporting span" - << " TraceId# " << HexEncode(s.trace_id()) - << " SpanId# " << HexEncode(s.span_id()) - << " ParentSpanId# " << HexEncode(s.parent_span_id()) - << " Name# " << s.name()); - - SpansSize -= item.Size; - s.Swap(sspan->add_spans()); - NextSendTimestamp += TDuration::MicroSeconds(1'000'000 / MaxSpansPerSecond); + TBatch::TData batch = std::move(BatchQueue.front()); + BatchQueue.pop(); + + ALOG_DEBUG(WILSON_SERVICE_ID, "exporting batch of " << batch.SizeSpans << " spans, total spans size: " << batch.SizeBytes); + Y_ABORT_UNLESS(batch.Request.resource_spansSize() == 1 && batch.Request.resource_spans(0).scope_spansSize() == 1); + for (const auto& span : batch.Request.resource_spans(0).scope_spans(0).spans()) { + ALOG_DEBUG(WILSON_SERVICE_ID, "exporting span" + << " TraceId# " << HexEncode(span.trace_id()) + << " SpanId# " << HexEncode(span.span_id()) + << " ParentSpanId# " << HexEncode(span.parent_span_id()) + << " Name# " << span.name()); } + NextSendTimestamp = now + TDuration::MicroSeconds((batch.SizeSpans * 1'000'000) / MaxSpansPerSecond); + SpansSizeBytes -= batch.SizeBytes; + ScheduleWakeup(NextSendTimestamp); - Context = std::make_unique(); + + auto context = std::make_unique(); if (GrpcSigner) { - GrpcSigner->SignClientContext(*Context); + GrpcSigner->SignClientContext(*context); } - Reader = Stub->AsyncExport(Context.get(), std::move(request), &CQ); - Reader->Finish(&Response, &Status, nullptr); + auto reader = Stub->AsyncExport(context.get(), std::move(batch.Request), &CQ); + auto uploadData = std::unique_ptr(new TExportRequestData { + .Context = std::move(context), + .Reader = std::move(reader), + }); + uploadData->Reader->Finish(&uploadData->Response, &uploadData->Status, uploadData.get()); + ALOG_TRACE(WILSON_SERVICE_ID, "started export request " << (void*)uploadData.get()); + ExportRequests.PushBack(uploadData.release()); + ++ExportRequestsCount; } - void CheckIfDone() { - if (Context) { - void *tag; - bool ok; - if (CQ.AsyncNext(&tag, &ok, std::chrono::system_clock::now()) == grpc::CompletionQueue::GOT_EVENT) { - if (!Status.ok()) { - LOG_ERROR_S(*TlsActivationContext, WILSON_SERVICE_ID, - "failed to commit traces: " << Status.error_message()); - } - - Reader.reset(); - Context.reset(); - } else { - ScheduleWakeup(TDuration::MilliSeconds(100)); + void ReapCompletedRequests() { + if (ExportRequests.Empty()) { + return; + } + void* tag; + bool ok; + while (CQ.AsyncNext(&tag, &ok, std::chrono::system_clock::now()) == grpc::CompletionQueue::GOT_EVENT) { + auto node = std::unique_ptr(static_cast(tag)); + ALOG_TRACE(WILSON_SERVICE_ID, "finished export request " << (void*)node.get()); + if (!node->Status.ok()) { + ALOG_ERROR(WILSON_SERVICE_ID, + "failed to commit traces: " << node->Status.error_message()); } + + --ExportRequestsCount; + node->Unlink(); + } + + if (!ExportRequests.Empty()) { + ScheduleWakeup(TDuration::MilliSeconds(100)); } } template void ScheduleWakeup(T&& deadline) { if (!WakeupScheduled) { - TActivationContext::Schedule(deadline, new IEventHandle(TEvents::TSystem::Wakeup, 0, SelfId(), {}, - nullptr, 0)); + TActivationContext::Schedule(deadline, + new IEventHandle(TEvents::TSystem::Wakeup, 0, + SelfId(), {}, nullptr, 0)); WakeupScheduled = true; } } - void HandleWakeup() { - Y_ABORT_UNLESS(WakeupScheduled); - WakeupScheduled = false; + void HandleWakeup(TEvents::TEvWakeup::TPtr& ev) { + const auto cookie = ev->Cookie; + ALOG_TRACE(WILSON_SERVICE_ID, "wakeup received w/ cookie=" << cookie); + if (cookie == 0) { + Y_ABORT_UNLESS(WakeupScheduled); + WakeupScheduled = false; + } else { + Y_ABORT_UNLESS(BatchCompletionScheduled); + BatchCompletionScheduled = false; + if (cookie == NextBatchCompletion.GetValue()) { + CompleteCurrentBatch(); + } else { + ScheduleBatchCompletionEvent(); + } + } TryMakeProgress(); } void TryMakeProgress() { - CheckIfDone(); + ReapCompletedRequests(); TryToSend(); } - STRICT_STFUNC(StateFunc, + STRICT_STFUNC(StateWork, hFunc(TEvWilson, Handle); - cFunc(TEvents::TSystem::Wakeup, HandleWakeup); + hFunc(TEvents::TEvWakeup, HandleWakeup); + ); + + STRICT_STFUNC(StateBroken, + IgnoreFunc(TEvWilson); ); }; } // anonymous - IActor* CreateWilsonUploader(WilsonUploaderParams params) { + IActor* CreateWilsonUploader(TWilsonUploaderParams params) { return new TWilsonUploader(std::move(params)); } - IActor* WilsonUploaderParams::CreateUploader() && { + IActor* TWilsonUploaderParams::CreateUploader() && { return CreateWilsonUploader(std::move(*this)); } diff --git a/ydb/library/actors/wilson/wilson_uploader.h b/ydb/library/actors/wilson/wilson_uploader.h index f664292659eb..3ba1f6109d4b 100644 --- a/ydb/library/actors/wilson/wilson_uploader.h +++ b/ydb/library/actors/wilson/wilson_uploader.h @@ -25,16 +25,21 @@ namespace NWilson { return NActors::TActorId(0, TStringBuf("WilsonUpload", 12)); } - struct WilsonUploaderParams { - TString Host; - ui16 Port; - TString RootCA; + struct TWilsonUploaderParams { + TString CollectorUrl; TString ServiceName; std::unique_ptr GrpcSigner; + ui64 MaxExportedSpansPerSecond = Max(); + ui64 MaxSpansInBatch = 150; + ui64 MaxBytesInBatch = 20'000'000; + ui64 MaxBatchAccumulationMilliseconds = 1'000; + ui32 SpanExportTimeoutSeconds = 60 * 60 * 24 * 365; + ui64 MaxExportRequestsInflight = 1; + NActors::IActor* CreateUploader() &&; }; - NActors::IActor* CreateWilsonUploader(WilsonUploaderParams params); + NActors::IActor* CreateWilsonUploader(TWilsonUploaderParams params); } // NWilson diff --git a/ydb/library/services/services.proto b/ydb/library/services/services.proto index 92c8eccd3357..41b130bf408b 100644 --- a/ydb/library/services/services.proto +++ b/ydb/library/services/services.proto @@ -1018,5 +1018,6 @@ message TActivity { GRAPH_SERVICE = 624; REPLICATION_WORKER = 625; SCHEMESHARD_BACKGROUND_CLEANING = 626; + JAEGER_TRACING_CONFIGURATOR = 627; }; }; diff --git a/ydb/library/wilson_ids/wilson.h b/ydb/library/wilson_ids/wilson.h index 32cfce06998b..3de4b9dc45df 100644 --- a/ydb/library/wilson_ids/wilson.h +++ b/ydb/library/wilson_ids/wilson.h @@ -1,60 +1,100 @@ #pragma once +#include + namespace NKikimr { + struct TComponentTracingLevels { +#ifdef DEFINE_TRACING_LEVELS +#error "Macro collision: DEFINE_TRACING_LEVELS" +#endif + +#define DEFINE_TRACING_LEVELS(COMPONENT, MINIMAL, BASIC, DETAILED, DIAGNOSTIC, TRACE) \ + struct COMPONENT { \ + enum : ui8 { \ + TopLevel = MINIMAL, \ + Basic = BASIC, \ + Detailed = DETAILED, \ + Diagnostic = DIAGNOSTIC, \ + Trace = TRACE, \ + }; \ + }; + + + DEFINE_TRACING_LEVELS(TGrpcProxy, 0, 5, 10, 14, 15) + DEFINE_TRACING_LEVELS(TQueryProcessor, 1, 6, 10, 14, 15) + DEFINE_TRACING_LEVELS(TDistributedTransactions, 2, 7, 11, 14, 15) + DEFINE_TRACING_LEVELS(TTablet, 3, 8, 12, 14, 15) + DEFINE_TRACING_LEVELS(TDistributedStorage, 4, 9, 13, 14, 15) + +#undef DEFINE_TRACING_LEVELS + + enum : ui8 { + // The most verbose detalisation level used in production + ProductionVerbose = 13, + }; + }; + struct TWilson { enum { - BlobStorage = 8, // DS proxy and lower levels - DsProxyInternals = 9, - VDiskTopLevel = 12, - VDiskInternals = 13, - PDisk = 14, + BlobStorage = TComponentTracingLevels::TDistributedStorage::TopLevel, + DsProxyInternals = TComponentTracingLevels::TDistributedStorage::Detailed, + VDiskTopLevel = TComponentTracingLevels::TDistributedStorage::Basic, + VDiskInternals = TComponentTracingLevels::TDistributedStorage::Detailed, + PDiskTopLevel = TComponentTracingLevels::TDistributedStorage::Basic, + PDiskBasic = TComponentTracingLevels::TDistributedStorage::Detailed, + PDiskDetailed = TComponentTracingLevels::TDistributedStorage::Detailed, }; }; struct TWilsonKqp { enum { - KqpSession = 8, - CompileService = 9, - CompileActor = 9, - SessionAcquireSnapshot = 9, + KqpSession = TComponentTracingLevels::TQueryProcessor::TopLevel, + CompileService = TComponentTracingLevels::TQueryProcessor::Basic, + CompileActor = TComponentTracingLevels::TQueryProcessor::Basic, + SessionAcquireSnapshot = TComponentTracingLevels::TQueryProcessor::Basic, - ExecuterTableResolve = 10, - ExecuterShardsResolve = 10, + ExecuterTableResolve = TComponentTracingLevels::TQueryProcessor::Detailed, + ExecuterShardsResolve = TComponentTracingLevels::TQueryProcessor::Detailed, - LiteralExecuter = 9, + LiteralExecuter = TComponentTracingLevels::TQueryProcessor::Basic, - DataExecuter = 9, - DataExecuterAcquireSnapshot = 10, - DataExecuterRunTasks = 10, + DataExecuter = TComponentTracingLevels::TQueryProcessor::Basic, + DataExecuterAcquireSnapshot = TComponentTracingLevels::TQueryProcessor::Detailed, + DataExecuterRunTasks = TComponentTracingLevels::TQueryProcessor::Detailed, - ScanExecuter = 9, - ScanExecuterRunTasks = 10, + ScanExecuter = TComponentTracingLevels::TQueryProcessor::Basic, + ScanExecuterRunTasks = TComponentTracingLevels::TQueryProcessor::Detailed, - KqpNodeSendTasks = 9, + KqpNodeSendTasks = TComponentTracingLevels::TQueryProcessor::Basic, - ProposeTransaction = 9, + ProposeTransaction = TComponentTracingLevels::TQueryProcessor::Basic, - ComputeActor = 9, + ComputeActor = TComponentTracingLevels::TQueryProcessor::Basic, - ReadActor = 9, - ReadActorShardsResolve = 10, + ReadActor = TComponentTracingLevels::TQueryProcessor::Basic, + ReadActorShardsResolve = TComponentTracingLevels::TQueryProcessor::Detailed, - LookupActor = 9, - LookupActorShardsResolve = 10, + LookupActor = TComponentTracingLevels::TQueryProcessor::Basic, + LookupActorShardsResolve = TComponentTracingLevels::TQueryProcessor::Detailed, + + BulkUpsertActor = TComponentTracingLevels::TQueryProcessor::TopLevel, }; }; struct TWilsonTablet { enum { - Tablet = 15 + TabletTopLevel = TComponentTracingLevels::TTablet::TopLevel, + TabletBasic = TComponentTracingLevels::TTablet::Basic, + TabletDetailed = TComponentTracingLevels::TTablet::Detailed, }; }; struct TWilsonGrpc { enum { - RequestProxy = 9, - RequestActor = 9, + RequestProxy = TComponentTracingLevels::TGrpcProxy::TopLevel, + RequestActor = TComponentTracingLevels::TGrpcProxy::TopLevel, + RequestCheckActor = TComponentTracingLevels::TGrpcProxy::Basic, }; }; diff --git a/ydb/services/keyvalue/grpc_service.cpp b/ydb/services/keyvalue/grpc_service.cpp index 595da30097cf..4cd1cc62eaab 100644 --- a/ydb/services/keyvalue/grpc_service.cpp +++ b/ydb/services/keyvalue/grpc_service.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace NKikimr::NGRpcService { @@ -40,7 +41,7 @@ void TKeyValueGRpcService::SetupIncomingRequests(NYdbGrpc::TLoggerPtr logger) { #error SETUP_METHOD macro collision #endif -#define SETUP_METHOD(methodName, method, rlMode) \ +#define SETUP_METHOD(methodName, method, rlMode, requestType) \ MakeIntrusiveSend(GRpcRequestProxyId, new TGrpcRequestOperationCall< \ Ydb::KeyValue::Y_CAT(methodName, Request), \ Ydb::KeyValue::Y_CAT(methodName, Response)>(reqCtx, &method, \ - TRequestAuxSettings{rlMode, nullptr})); \ + TRequestAuxSettings { \ + .RlMode = TRateLimiterMode::rlMode, \ + .RequestType = NJaegerTracing::ERequestType::requestType, \ + })); \ }, \ &Ydb::KeyValue::V1::KeyValueService::AsyncService::Y_CAT(Request, methodName), \ "KeyValue/" Y_STRINGIZE(methodName), \ @@ -62,18 +66,18 @@ void TKeyValueGRpcService::SetupIncomingRequests(NYdbGrpc::TLoggerPtr logger) { getCounterBlock("keyvalue", Y_STRINGIZE(methodName)) \ )->Run() - SETUP_METHOD(CreateVolume, DoCreateVolumeKeyValue, TRateLimiterMode::Rps); - SETUP_METHOD(DropVolume, DoDropVolumeKeyValue, TRateLimiterMode::Rps); - SETUP_METHOD(AlterVolume, DoAlterVolumeKeyValue, TRateLimiterMode::Rps); - SETUP_METHOD(DescribeVolume, DoDescribeVolumeKeyValue, TRateLimiterMode::Rps); - SETUP_METHOD(ListLocalPartitions, DoListLocalPartitionsKeyValue, TRateLimiterMode::Rps); - - SETUP_METHOD(AcquireLock, DoAcquireLockKeyValue, TRateLimiterMode::Rps); - SETUP_METHOD(ExecuteTransaction, DoExecuteTransactionKeyValue, TRateLimiterMode::Rps); - SETUP_METHOD(Read, DoReadKeyValue, TRateLimiterMode::Rps); - SETUP_METHOD(ReadRange, DoReadRangeKeyValue, TRateLimiterMode::Rps); - SETUP_METHOD(ListRange, DoListRangeKeyValue, TRateLimiterMode::Rps); - SETUP_METHOD(GetStorageChannelStatus, DoGetStorageChannelStatusKeyValue, TRateLimiterMode::Rps); + SETUP_METHOD(CreateVolume, DoCreateVolumeKeyValue, Rps, KEYVALUE_CREATEVOLUME); + SETUP_METHOD(DropVolume, DoDropVolumeKeyValue, Rps, KEYVALUE_DROPVOLUME); + SETUP_METHOD(AlterVolume, DoAlterVolumeKeyValue, Rps, KEYVALUE_ALTERVOLUME); + SETUP_METHOD(DescribeVolume, DoDescribeVolumeKeyValue, Rps, KEYVALUE_DESCRIBEVOLUME); + SETUP_METHOD(ListLocalPartitions, DoListLocalPartitionsKeyValue, Rps, KEYVALUE_LISTLOCALPARTITIONS); + + SETUP_METHOD(AcquireLock, DoAcquireLockKeyValue, Rps, KEYVALUE_ACQUIRELOCK); + SETUP_METHOD(ExecuteTransaction, DoExecuteTransactionKeyValue, Rps, KEYVALUE_EXECUTETRANSACTION); + SETUP_METHOD(Read, DoReadKeyValue, Rps, KEYVALUE_READ); + SETUP_METHOD(ReadRange, DoReadRangeKeyValue, Rps, KEYVALUE_READRANGE); + SETUP_METHOD(ListRange, DoListRangeKeyValue, Rps, KEYVALUE_LISTRANGE); + SETUP_METHOD(GetStorageChannelStatus, DoGetStorageChannelStatusKeyValue, Rps, KEYVALUE_GETSTORAGECHANNELSTATUS); #undef SETUP_METHOD } diff --git a/ydb/services/local_discovery/grpc_func_call.h b/ydb/services/local_discovery/grpc_func_call.h index 8c501bf6a585..4efe50d5662f 100644 --- a/ydb/services/local_discovery/grpc_func_call.h +++ b/ydb/services/local_discovery/grpc_func_call.h @@ -49,6 +49,14 @@ class TGrpcRequestFunctionCall return true; } } + + NJaegerTracing::TRequestDiscriminator GetRequestDiscriminator() const override { + return { + .RequestType = AuxSettings.RequestType, + .Database = TBase::GetDatabaseName(), + }; + } + private: TFuncCallback PassMethod; const TRequestAuxSettings AuxSettings; diff --git a/ydb/services/local_discovery/grpc_service.cpp b/ydb/services/local_discovery/grpc_service.cpp index 0615355a286a..76cf65491395 100644 --- a/ydb/services/local_discovery/grpc_service.cpp +++ b/ydb/services/local_discovery/grpc_service.cpp @@ -87,22 +87,25 @@ void TGRpcLocalDiscoveryService::SetupIncomingRequests(NYdbGrpc::TLoggerPtr logg #error macro already defined #endif -#define ADD_REQUEST(NAME, CB) \ +#define ADD_REQUEST(NAME, CB, REQUEST_TYPE) \ MakeIntrusive> \ (this, &Service_, CQ_, \ - [this](NYdbGrpc::IRequestContextBase *ctx) { \ + [this](NYdbGrpc::IRequestContextBase *ctx) { \ NGRpcService::ReportGrpcReqToMon(*ActorSystem_, ctx->GetPeer(), GetSdkBuildInfo(ctx)); \ ActorSystem_->Send(GRpcRequestProxyId_, \ new TGrpcRequestOperationCall \ - (ctx, CB, TRequestAuxSettings{TRateLimiterMode::Rps, nullptr})); \ + (ctx, CB, TRequestAuxSettings { \ + .RlMode = TRateLimiterMode::Rps, \ + .RequestType = NJaegerTracing::ERequestType::DISCOVERY_##REQUEST_TYPE, \ + })); \ }, &Ydb::Discovery::V1::DiscoveryService::AsyncService::Request ## NAME, \ #NAME, logger, getCounterBlock("discovery", #NAME))->Run(); - ADD_REQUEST(WhoAmI, &DoWhoAmIRequest) + ADD_REQUEST(WhoAmI, &DoWhoAmIRequest, WHOAMI) NodeRegistrationRequest = [authParams = this->DynamicNodeAuthorizationParams] (std::unique_ptr p, const IFacilityProvider& f) { DoNodeRegistrationRequest(std::move(p), f, authParams); }; - ADD_REQUEST(NodeRegistration, NodeRegistrationRequest) + ADD_REQUEST(NodeRegistration, NodeRegistrationRequest, NODEREGISTRATION) #undef ADD_REQUEST using namespace std::placeholders; @@ -111,19 +114,22 @@ using namespace std::placeholders; #error macro already defined #endif -#define ADD_METHOD(NAME, METHOD) \ +#define ADD_METHOD(NAME, METHOD, REQUEST_TYPE) \ MakeIntrusive> \ (this, &Service_, CQ_, \ - [this](NYdbGrpc::IRequestContextBase *ctx) { \ + [this](NYdbGrpc::IRequestContextBase *ctx) { \ NGRpcService::ReportGrpcReqToMon(*ActorSystem_, ctx->GetPeer(), GetSdkBuildInfo(ctx)); \ TFuncCallback cb = std::bind(&TGRpcLocalDiscoveryService::METHOD, this, _1, _2); \ ActorSystem_->Send(GRpcRequestProxyId_, \ new TGrpcRequestFunctionCall \ - (ctx, cb, TRequestAuxSettings{TRateLimiterMode::Rps, nullptr})); \ + (ctx, cb, TRequestAuxSettings { \ + .RlMode = TRateLimiterMode::Rps, \ + .RequestType = NJaegerTracing::ERequestType::DISCOVERY_##REQUEST_TYPE, \ + })); \ }, &Ydb::Discovery::V1::DiscoveryService::AsyncService::Request ## NAME, \ #NAME, logger, getCounterBlock("discovery", #NAME))->Run(); - ADD_METHOD(ListEndpoints, DoListEndpointsRequest) + ADD_METHOD(ListEndpoints, DoListEndpointsRequest, LISTENDPOINTS) #undef ADD_METHOD } diff --git a/ydb/services/ydb/ydb_query.cpp b/ydb/services/ydb/ydb_query.cpp index 760599e68ccc..b8bc700e89f3 100644 --- a/ydb/services/ydb/ydb_query.cpp +++ b/ydb/services/ydb/ydb_query.cpp @@ -37,7 +37,7 @@ void TGRpcYdbQueryService::SetupIncomingRequests(NYdbGrpc::TLoggerPtr logger) { #ifdef ADD_REQUEST #error ADD_REQUEST macro already defined #endif -#define ADD_REQUEST(NAME, IN, OUT, CB, ...) \ +#define ADD_REQUEST(NAME, IN, OUT, CB, REQUEST_TYPE, ...) \ for (size_t i = 0; i < HandlersPerCompletionQueue; ++i) { \ for (auto* cq: CQS) { \ MakeIntrusive>(this, &Service_, cq, \ @@ -45,22 +45,26 @@ void TGRpcYdbQueryService::SetupIncomingRequests(NYdbGrpc::TLoggerPtr logger) { NGRpcService::ReportGrpcReqToMon(*ActorSystem_, ctx->GetPeer()); \ ActorSystem_->Send(GRpcProxies_[proxyCounter % GRpcProxies_.size()], \ new TGrpcRequestNoOperationCall \ - (ctx, &CB, TRequestAuxSettings{RLSWITCH(TRateLimiterMode::Rps), nullptr __VA_OPT__(, TAuditMode::__VA_ARGS__)})); \ + (ctx, &CB, TRequestAuxSettings { \ + .RlMode = RLSWITCH(TRateLimiterMode::Rps), \ + __VA_OPT__(.AuditMode = TAuditMode::__VA_ARGS__,) \ + .RequestType = NJaegerTracing::ERequestType::QUERY_##REQUEST_TYPE, \ + })); \ }, &Ydb::Query::V1::QueryService::AsyncService::Request ## NAME, \ #NAME, logger, getCounterBlock("query", #NAME))->Run(); \ ++proxyCounter; \ } \ } - ADD_REQUEST(ExecuteQuery, ExecuteQueryRequest, ExecuteQueryResponsePart, DoExecuteQuery, Auditable); - ADD_REQUEST(ExecuteScript, ExecuteScriptRequest, Ydb::Operations::Operation, DoExecuteScript, Auditable); - ADD_REQUEST(FetchScriptResults, FetchScriptResultsRequest, FetchScriptResultsResponse, DoFetchScriptResults); - ADD_REQUEST(CreateSession, CreateSessionRequest, CreateSessionResponse, DoCreateSession); - ADD_REQUEST(DeleteSession, DeleteSessionRequest, DeleteSessionResponse, DoDeleteSession); - ADD_REQUEST(AttachSession, AttachSessionRequest, SessionState, DoAttachSession); - ADD_REQUEST(BeginTransaction, BeginTransactionRequest, BeginTransactionResponse, DoBeginTransaction); - ADD_REQUEST(CommitTransaction, CommitTransactionRequest, CommitTransactionResponse, DoCommitTransaction); - ADD_REQUEST(RollbackTransaction, RollbackTransactionRequest, RollbackTransactionResponse, DoRollbackTransaction); + ADD_REQUEST(ExecuteQuery, ExecuteQueryRequest, ExecuteQueryResponsePart, DoExecuteQuery, EXECUTEQUERY, Auditable); + ADD_REQUEST(ExecuteScript, ExecuteScriptRequest, Ydb::Operations::Operation, DoExecuteScript, EXECUTESCRIPT, Auditable); + ADD_REQUEST(FetchScriptResults, FetchScriptResultsRequest, FetchScriptResultsResponse, DoFetchScriptResults, FETCHSCRIPTRESULTS); + ADD_REQUEST(CreateSession, CreateSessionRequest, CreateSessionResponse, DoCreateSession, CREATESESSION); + ADD_REQUEST(DeleteSession, DeleteSessionRequest, DeleteSessionResponse, DoDeleteSession, DELETESESSION); + ADD_REQUEST(AttachSession, AttachSessionRequest, SessionState, DoAttachSession, ATTACHSESSION); + ADD_REQUEST(BeginTransaction, BeginTransactionRequest, BeginTransactionResponse, DoBeginTransaction, BEGINTRANSACTION); + ADD_REQUEST(CommitTransaction, CommitTransactionRequest, CommitTransactionResponse, DoCommitTransaction, COMMITTRANSACTION); + ADD_REQUEST(RollbackTransaction, RollbackTransactionRequest, RollbackTransactionResponse, DoRollbackTransaction, ROLLBACKTRANSACTION); #undef ADD_REQUEST } diff --git a/ydb/services/ydb/ydb_table.cpp b/ydb/services/ydb/ydb_table.cpp index 9324baf4d1c2..37067b16e358 100644 --- a/ydb/services/ydb/ydb_table.cpp +++ b/ydb/services/ydb/ydb_table.cpp @@ -40,63 +40,70 @@ void TGRpcYdbTableService::SetupIncomingRequests(NYdbGrpc::TLoggerPtr logger) { #error ADD_STREAM_REQUEST_LIMIT macro already defined #endif -#define ADD_REQUEST_LIMIT(NAME, CB, LIMIT_TYPE, ...) \ +#define ADD_REQUEST_LIMIT(NAME, CB, LIMIT_TYPE, REQUEST_TYPE, ...) \ for (size_t i = 0; i < HandlersPerCompletionQueue; ++i) { \ for (auto* cq: CQS) { \ MakeIntrusive> \ (this, &Service_, cq, \ - [this, proxyCounter](NYdbGrpc::IRequestContextBase *ctx) { \ + [this, proxyCounter](NYdbGrpc::IRequestContextBase *ctx) { \ NGRpcService::ReportGrpcReqToMon(*ActorSystem_, ctx->GetPeer()); \ ActorSystem_->Send(GRpcProxies_[proxyCounter % GRpcProxies_.size()], \ new TGrpcRequestOperationCall \ - (ctx, &CB, TRequestAuxSettings{RLSWITCH(TRateLimiterMode::LIMIT_TYPE), nullptr __VA_OPT__(, TAuditMode::__VA_ARGS__)})); \ + (ctx, &CB, TRequestAuxSettings { \ + .RlMode = RLSWITCH(TRateLimiterMode::LIMIT_TYPE), \ + __VA_OPT__(.AuditMode = TAuditMode::__VA_ARGS__,) \ + .RequestType = NJaegerTracing::ERequestType::TABLE_##REQUEST_TYPE, \ + })); \ }, &Ydb::Table::V1::TableService::AsyncService::Request ## NAME, \ #NAME, logger, getCounterBlock("table", #NAME))->Run(); \ ++proxyCounter; \ } \ } -#define ADD_STREAM_REQUEST_LIMIT(NAME, IN, OUT, CB, LIMIT_TYPE) \ +#define ADD_STREAM_REQUEST_LIMIT(NAME, IN, OUT, CB, LIMIT_TYPE, REQUEST_TYPE) \ for (size_t i = 0; i < HandlersPerCompletionQueue; ++i) { \ for (auto* cq: CQS) { \ MakeIntrusive> \ (this, &Service_, cq, \ - [this, proxyCounter](NYdbGrpc::IRequestContextBase *ctx) { \ + [this, proxyCounter](NYdbGrpc::IRequestContextBase *ctx) { \ NGRpcService::ReportGrpcReqToMon(*ActorSystem_, ctx->GetPeer()); \ ActorSystem_->Send(GRpcProxies_[proxyCounter % GRpcProxies_.size()], \ new TGrpcRequestNoOperationCall \ - (ctx, &CB, TRequestAuxSettings{RLSWITCH(TRateLimiterMode::LIMIT_TYPE), nullptr})); \ + (ctx, &CB, TRequestAuxSettings { \ + .RlMode = RLSWITCH(TRateLimiterMode::LIMIT_TYPE), \ + .RequestType = NJaegerTracing::ERequestType::TABLE_##REQUEST_TYPE, \ + })); \ }, &Ydb::Table::V1::TableService::AsyncService::Request ## NAME, \ #NAME, logger, getCounterBlock("table", #NAME))->Run(); \ ++proxyCounter; \ } \ } - ADD_REQUEST_LIMIT(CreateSession, DoCreateSessionRequest, Rps) - ADD_REQUEST_LIMIT(KeepAlive, DoKeepAliveRequest, Rps) - ADD_REQUEST_LIMIT(AlterTable, DoAlterTableRequest, Rps) - ADD_REQUEST_LIMIT(CreateTable, DoCreateTableRequest, Rps) - ADD_REQUEST_LIMIT(DropTable, DoDropTableRequest, Rps) - ADD_REQUEST_LIMIT(DescribeTable, DoDescribeTableRequest, Rps) - ADD_REQUEST_LIMIT(CopyTable, DoCopyTableRequest, Rps) - ADD_REQUEST_LIMIT(CopyTables, DoCopyTablesRequest, Rps) - ADD_REQUEST_LIMIT(RenameTables, DoRenameTablesRequest, Rps) - ADD_REQUEST_LIMIT(ExplainDataQuery, DoExplainDataQueryRequest, Rps) - ADD_REQUEST_LIMIT(ExecuteSchemeQuery, DoExecuteSchemeQueryRequest, Rps) - ADD_REQUEST_LIMIT(BeginTransaction, DoBeginTransactionRequest, Rps, Auditable) - ADD_REQUEST_LIMIT(DescribeTableOptions, DoDescribeTableOptionsRequest, Rps) - - ADD_REQUEST_LIMIT(DeleteSession, DoDeleteSessionRequest, Off) - ADD_REQUEST_LIMIT(CommitTransaction, DoCommitTransactionRequest, Off, Auditable) - ADD_REQUEST_LIMIT(RollbackTransaction, DoRollbackTransactionRequest, Off, Auditable) - - ADD_REQUEST_LIMIT(PrepareDataQuery, DoPrepareDataQueryRequest, Ru, Auditable) - ADD_REQUEST_LIMIT(ExecuteDataQuery, DoExecuteDataQueryRequest, Ru, Auditable) - ADD_REQUEST_LIMIT(BulkUpsert, DoBulkUpsertRequest, Ru, Auditable) - - ADD_STREAM_REQUEST_LIMIT(StreamExecuteScanQuery, ExecuteScanQueryRequest, ExecuteScanQueryPartialResponse, DoExecuteScanQueryRequest, RuOnProgress) - ADD_STREAM_REQUEST_LIMIT(StreamReadTable, ReadTableRequest, ReadTableResponse, DoReadTableRequest, RuOnProgress) - ADD_STREAM_REQUEST_LIMIT(ReadRows, ReadRowsRequest, ReadRowsResponse, DoReadRowsRequest, Ru) + ADD_REQUEST_LIMIT(CreateSession, DoCreateSessionRequest, Rps, CREATESESSION) + ADD_REQUEST_LIMIT(KeepAlive, DoKeepAliveRequest, Rps, KEEPALIVE) + ADD_REQUEST_LIMIT(AlterTable, DoAlterTableRequest, Rps, ALTERTABLE) + ADD_REQUEST_LIMIT(CreateTable, DoCreateTableRequest, Rps, CREATETABLE) + ADD_REQUEST_LIMIT(DropTable, DoDropTableRequest, Rps, DROPTABLE) + ADD_REQUEST_LIMIT(DescribeTable, DoDescribeTableRequest, Rps, DESCRIBETABLE) + ADD_REQUEST_LIMIT(CopyTable, DoCopyTableRequest, Rps, COPYTABLE) + ADD_REQUEST_LIMIT(CopyTables, DoCopyTablesRequest, Rps, COPYTABLES) + ADD_REQUEST_LIMIT(RenameTables, DoRenameTablesRequest, Rps, RENAMETABLES) + ADD_REQUEST_LIMIT(ExplainDataQuery, DoExplainDataQueryRequest, Rps, EXPLAINDATAQUERY) + ADD_REQUEST_LIMIT(ExecuteSchemeQuery, DoExecuteSchemeQueryRequest, Rps, EXECUTESCHEMEQUERY) + ADD_REQUEST_LIMIT(BeginTransaction, DoBeginTransactionRequest, Rps, BEGINTRANSACTION, Auditable) + ADD_REQUEST_LIMIT(DescribeTableOptions, DoDescribeTableOptionsRequest, Rps, DESCRIBETABLEOPTIONS) + + ADD_REQUEST_LIMIT(DeleteSession, DoDeleteSessionRequest, Off, DELETESESSION) + ADD_REQUEST_LIMIT(CommitTransaction, DoCommitTransactionRequest, Off, COMMITTRANSACTION, Auditable) + ADD_REQUEST_LIMIT(RollbackTransaction, DoRollbackTransactionRequest, Off, ROLLBACKTRANSACTION, Auditable) + + ADD_REQUEST_LIMIT(PrepareDataQuery, DoPrepareDataQueryRequest, Ru, PREPAREDATAQUERY, Auditable) + ADD_REQUEST_LIMIT(ExecuteDataQuery, DoExecuteDataQueryRequest, Ru, EXECUTEDATAQUERY, Auditable) + ADD_REQUEST_LIMIT(BulkUpsert, DoBulkUpsertRequest, Ru, BULKUPSERT, Auditable) + + ADD_STREAM_REQUEST_LIMIT(StreamExecuteScanQuery, ExecuteScanQueryRequest, ExecuteScanQueryPartialResponse, DoExecuteScanQueryRequest, RuOnProgress, STREAMEXECUTESCANQUERY) + ADD_STREAM_REQUEST_LIMIT(StreamReadTable, ReadTableRequest, ReadTableResponse, DoReadTableRequest, RuOnProgress, STREAMREADTABLE) + ADD_STREAM_REQUEST_LIMIT(ReadRows, ReadRowsRequest, ReadRowsResponse, DoReadRowsRequest, Ru, READROWS) #undef ADD_REQUEST_LIMIT #undef ADD_STREAM_REQUEST_LIMIT diff --git a/ydb/tools/cfg/static.py b/ydb/tools/cfg/static.py index 84bf540891da..95faefd86e5a 100644 --- a/ydb/tools/cfg/static.py +++ b/ydb/tools/cfg/static.py @@ -113,14 +113,13 @@ def __init__( ) ) self._enable_cms_config_cache = template.get("enable_cms_config_cache", enable_cms_config_cache) - if "tracing" in template: - tracing = template["tracing"] + tracing = template.get("tracing_config") + if tracing is not None: self.__tracing = ( - tracing["host"], - tracing["port"], - tracing["root_ca"], - tracing["service_name"], - tracing.get("auth_config") + tracing["backend"], + tracing.get("uploader"), + tracing.get("sampling", []), + tracing.get("external_throttling", []), ) else: self.__tracing = None @@ -1121,37 +1120,131 @@ def __generate_sys_txt(self): self.__generate_sys_txt_advanced() def __generate_tracing_txt(self): + def get_selectors(selectors): + selectors_pb = config_pb2.TTracingConfig.TSelectors() + + request_type = selectors["request_type"] + if request_type is not None: + selectors_pb.RequestType = request_type + + return selectors_pb + + def get_sampling_scope(sampling): + sampling_scope_pb = config_pb2.TTracingConfig.TSamplingRule() + selectors = sampling.get("scope") + if selectors is not None: + sampling_scope_pb.Scope.CopyFrom(get_selectors(selectors)) + sampling_scope_pb.Fraction = sampling['fraction'] + sampling_scope_pb.Level = sampling['level'] + sampling_scope_pb.MaxTracesPerMinute = sampling['max_traces_per_minute'] + sampling_scope_pb.MaxTracesBurst = sampling.get('max_traces_burst', 0) + return sampling_scope_pb + + def get_external_throttling(throttling): + throttling_scope_pb = config_pb2.TTracingConfig.TExternalThrottlingRule() + selectors = throttling.get("scope") + if selectors is not None: + throttling_scope_pb.Scope.CopyFrom(get_selectors(selectors)) + throttling_scope_pb.MaxTracesPerMinute = throttling['max_traces_per_minute'] + throttling_scope_pb.MaxTracesBurst = throttling.get('max_traces_burst', 0) + return throttling_scope_pb + + def get_auth_config(auth): + auth_pb = config_pb2.TTracingConfig.TBackendConfig.TAuthConfig() + tvm = auth.get("tvm") + if tvm is not None: + tvm_pb = auth_pb.Tvm + + if "host" in tvm: + tvm_pb.Host = tvm["host"] + if "port" in tvm: + tvm_pb.Port = tvm["port"] + tvm_pb.SelfTvmId = tvm["self_tvm_id"] + tvm_pb.TracingTvmId = tvm["tracing_tvm_id"] + if "disk_cache_dir" in tvm: + tvm_pb.DiskCacheDir = tvm["disk_cache_dir"] + + if "plain_text_secret" in tvm: + tvm_pb.PlainTextSecret = tvm["plain_text_secret"] + elif "secret_file" in tvm: + tvm_pb.SecretFile = tvm["secret_file"] + elif "secret_environment_variable" in tvm: + tvm_pb.SecretEnvironmentVariable = tvm["secret_environment_variable"] + return auth_pb + + def get_opentelemetry(opentelemetry): + opentelemetry_pb = config_pb2.TTracingConfig.TBackendConfig.TOpentelemetryBackend() + + opentelemetry_pb.CollectorUrl = opentelemetry["collector_url"] + opentelemetry_pb.ServiceName = opentelemetry["service_name"] + + return opentelemetry_pb + + def get_backend(backend): + backend_pb = config_pb2.TTracingConfig.TBackendConfig() + + auth = backend.get("auth_config") + if auth is not None: + backend_pb.AuthConfig.CopyFrom(get_auth_config(auth)) + + opentelemetry = backend["opentelemetry"] + if opentelemetry is not None: + backend_pb.Opentelemetry.CopyFrom(get_opentelemetry(opentelemetry)) + + return backend_pb + + def get_uploader(uploader): + uploader_pb = config_pb2.TTracingConfig.TUploaderConfig() + + max_exported_spans_per_second = uploader.get("max_exported_spans_per_second") + if max_exported_spans_per_second is not None: + uploader_pb.MaxExportedSpansPerSecond = max_exported_spans_per_second + + max_spans_in_batch = uploader.get("max_spans_in_batch") + if max_spans_in_batch is not None: + uploader_pb.MaxSpansInBatch = max_spans_in_batch + + max_bytes_in_batch = uploader.get("max_bytes_in_batch") + if max_bytes_in_batch is not None: + uploader_pb.MaxBytesInBatch = max_bytes_in_batch + + max_batch_accumulation_milliseconds = uploader.get("max_batch_accumulation_milliseconds") + if max_batch_accumulation_milliseconds is not None: + uploader_pb.MaxBatchAccumulationMilliseconds = max_batch_accumulation_milliseconds + + span_export_timeout_seconds = uploader.get("span_export_timeout_seconds") + if span_export_timeout_seconds is not None: + uploader_pb.SpanExportTimeoutSeconds = span_export_timeout_seconds + + max_export_requests_inflight = uploader.get("max_export_requests_inflight") + if max_export_requests_inflight is not None: + uploader_pb.MaxExportRequestsInflight = max_export_requests_inflight + + return uploader_pb + pb = config_pb2.TAppConfig() if self.__tracing: tracing_pb = pb.TracingConfig ( - tracing_pb.Host, - tracing_pb.Port, - tracing_pb.RootCA, - tracing_pb.ServiceName, - auth_config + backend, + uploader, + sampling, + external_throttling ) = self.__tracing - if auth_config: - auth_pb = tracing_pb.AuthConfig - if "tvm" in auth_config: - tvm = auth_config.get("tvm") - tvm_pb = auth_pb.Tvm - - if "host" in tvm: - tvm_pb.Host = tvm["host"] - if "port" in tvm: - tvm_pb.Port = tvm["port"] - tvm_pb.SelfTvmId = tvm["self_tvm_id"] - tvm_pb.TracingTvmId = tvm["tracing_tvm_id"] - tvm_pb.DiskCacheDir = tvm["disk_cache_dir"] + assert isinstance(sampling, list) + assert isinstance(external_throttling, list) + + tracing_pb.Backend.CopyFrom(get_backend(backend)) + + if uploader is not None: + tracing_pb.Uploader.CopyFrom(get_uploader(uploader)) + + for sampling_scope in sampling: + tracing_pb.Sampling.append(get_sampling_scope(sampling_scope)) - if "plain_text_secret" in tvm: - tvm_pb.PlainTextSecret = tvm["plain_text_secret"] - elif "secret_file" in tvm: - tvm_pb.SecretFile = tvm["secret_file"] - elif "secret_environment_variable" in tvm: - tvm_pb.SecretEnvironmentVariable = tvm["secret_environment_variable"] + for throttling_scope in external_throttling: + tracing_pb.ExternalThrottling.append(get_external_throttling(throttling_scope)) self.__proto_configs["tracing.txt"] = pb diff --git a/ydb/tools/cfg/validation.py b/ydb/tools/cfg/validation.py index 4e17e4ac78b2..e1b4315c5e40 100644 --- a/ydb/tools/cfg/validation.py +++ b/ydb/tools/cfg/validation.py @@ -126,21 +126,91 @@ "additionalProperties": False, } +SELECTORS_CONFIGS = dict( + type="object", + properties=dict( + request_type=dict(type="string"), + ), + required=[], + additionalProperties=False, +) + TRACING_SCHEMA = dict( type="object", properties=dict( - host=dict(type="string"), - port=dict(type="integer"), - root_ca=dict(type="string"), - service_name=dict(type="string"), - auth_config=dict(type="object"), + backend=dict( + type="object", + properties=dict( + auth_config=dict( + type="object", + properties=dict( + tvm=dict( + type="object", + properties=dict( + url=dict(type="string"), + self_tvm_id=dict(type="integer"), + tracing_tvm_id=dict(type="integer"), + disc_cache_dir=dict(type="string"), + plain_text_secret=dict(type="string"), + secret_file=dict(type="string"), + secret_environment_variable=dict(type="string"), + ), + required=["self_tvm_id", "tracing_tvm_id"], + ) + ), + required=["tvm"], + ), + opentelemetry=dict( + type="object", + properties=dict( + collector_url=dict(type="string"), + service_name=dict(type="string"), + ) + ), + ), + required=["opentelemetry"], + additionalProperties=False, + ), + uploader=dict( + type="object", + properties=dict( + max_exported_spans_per_second=dict(type="integer", minimum=1), + max_spans_in_batch=dict(type="integer", minimum=1), + max_bytes_in_batch=dict(type="integer"), + max_batch_accumulation_milliseconds=dict(type="integer"), + span_export_timeout_seconds=dict(type="integer", minimum=1), + max_export_requests_inflight=dict(type="integer", minimum=1), + ), + additionalProperties=False, + ), + sampling=dict( + type="array", + items=dict( + type="object", + properties=dict( + scope=SELECTORS_CONFIGS, + fraction=dict(type="number", minimum=0, maximum=1), + level=dict(type="integer", minimum=0, maximum=15), + max_traces_per_minute=dict(type="integer", minimum=0), + max_traces_burst=dict(type="integer", minimum=0), + ), + required=["fraction", "level", "max_traces_per_minute"], + ), + ), + external_throttling=dict( + type="array", + items=dict( + type="object", + properties=dict( + scope=SELECTORS_CONFIGS, + max_traces_per_minute=dict(type="integer", minimum=0), + max_traces_burst=dict(type="integer", minimum=0), + ), + required=["max_traces_per_minute"], + ), + ), ), - required=[ - "host", - "port", - "root_ca", - "service_name", - ], + required=["backend"], additionalProperties=False, ) @@ -925,7 +995,7 @@ "features": copy.deepcopy(FEATURES_SCHEMA), "shared_cache": copy.deepcopy(SHARED_CACHE_SCHEMA), "sys": copy.deepcopy(SYS_SCHEMA), - "tracing": copy.deepcopy(TRACING_SCHEMA), + "tracing_config": copy.deepcopy(TRACING_SCHEMA), "failure_injection_config": copy.deepcopy(FAILURE_INJECTION_CONFIG_SCHEMA), "solomon": copy.deepcopy(SOLOMON_SCHEMA), "cms": copy.deepcopy(CMS_SCHEMA),