From b38cf1e4093c12732fb775e4fb6f3180f42ad721 Mon Sep 17 00:00:00 2001 From: Dmitry Orlov Date: Wed, 17 Apr 2024 16:23:06 +0300 Subject: [PATCH 1/3] Refactoring to perform cardinality estimation specifically for YT. --- .../yql/core/cbo/cbo_optimizer_new.cpp | 119 +++++++++++++++ ydb/library/yql/core/cbo/cbo_optimizer_new.h | 136 +++++++++--------- ydb/library/yql/core/yql_cost_function.cpp | 103 +------------ ydb/library/yql/core/yql_cost_function.h | 40 +++--- .../yql/dq/opt/dq_opt_join_cost_based.cpp | 3 +- ydb/library/yql/dq/opt/dq_opt_stat.cpp | 4 +- ydb/library/yql/dq/opt/dq_opt_stat.h | 2 + 7 files changed, 216 insertions(+), 191 deletions(-) diff --git a/ydb/library/yql/core/cbo/cbo_optimizer_new.cpp b/ydb/library/yql/core/cbo/cbo_optimizer_new.cpp index 9fbfda733e2f..d317ecafad77 100644 --- a/ydb/library/yql/core/cbo/cbo_optimizer_new.cpp +++ b/ydb/library/yql/core/cbo/cbo_optimizer_new.cpp @@ -109,4 +109,123 @@ void TJoinOptimizerNode::Print(std::stringstream& stream, int ntabs) { RightArg->Print(stream, ntabs+1); } +bool IsPKJoin(const TOptimizerStatistics& stats, const TVector& joinKeys) { + if (stats.KeyColumns.size()==0) { + return false; + } + + for(size_t i=0; i& left, + const std::shared_ptr& right, + const std::set>& joinConditions, + const TVector& leftJoinKeys, + const TVector& rightJoinKeys, + EJoinAlgoType joinAlgo) { + + Y_UNUSED(left); + Y_UNUSED(right); + Y_UNUSED(joinConditions); + Y_UNUSED(leftJoinKeys); + Y_UNUSED(rightJoinKeys); + + return joinAlgo == EJoinAlgoType::MapJoin; +} + +double TDummyProviderContext::ComputeJoinCost(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, const double outputRows, const double outputByteSize, EJoinAlgoType joinAlgo) const { + Y_UNUSED(outputByteSize); + Y_UNUSED(joinAlgo); + return leftStats.Nrows + 2.0 * rightStats.Nrows + outputRows; +} + +/** + * Compute the cost and output cardinality of a join + * + * Currently a very basic computation targeted at GraceJoin + * + * 
The build is on the right side, so we make the build side a bit more expensive than the probe +*/ +TOptimizerStatistics TDummyProviderContext::ComputeJoinStats( + const TOptimizerStatistics& leftStats, + const TOptimizerStatistics& rightStats, + const std::set>& joinConditions, + EJoinAlgoType joinAlgo) const +{ + TVector leftJoinKeys; + TVector rightJoinKeys; + + for (auto c : joinConditions) { + leftJoinKeys.emplace_back(c.first.AttributeName); + rightJoinKeys.emplace_back(c.second.AttributeName); + } + + return ComputeJoinStats(leftStats, rightStats, leftJoinKeys, rightJoinKeys, joinAlgo); +} + +TOptimizerStatistics TDummyProviderContext::ComputeJoinStats( + const TOptimizerStatistics& leftStats, + const TOptimizerStatistics& rightStats, + const TVector& leftJoinKeys, + const TVector& rightJoinKeys, + EJoinAlgoType joinAlgo) const +{ + double newCard; + EStatisticsType outputType; + bool leftKeyColumns = false; + bool rightKeyColumns = false; + double selectivity = 1.0; + + + if (IsPKJoin(rightStats,rightJoinKeys)) { + newCard = leftStats.Nrows * rightStats.Selectivity; + selectivity = leftStats.Selectivity * rightStats.Selectivity; + leftKeyColumns = true; + if (leftStats.Type == EStatisticsType::BaseTable){ + outputType = EStatisticsType::FilteredFactTable; + } else { + outputType = leftStats.Type; + } + } + else if (IsPKJoin(leftStats,leftJoinKeys)) { + newCard = rightStats.Nrows; + newCard = rightStats.Nrows * leftStats.Selectivity; + selectivity = leftStats.Selectivity * rightStats.Selectivity; + + rightKeyColumns = true; + if (rightStats.Type == EStatisticsType::BaseTable){ + outputType = EStatisticsType::FilteredFactTable; + } else { + outputType = rightStats.Type; + } + } + else { + newCard = 0.2 * leftStats.Nrows * rightStats.Nrows; + outputType = EStatisticsType::ManyManyJoin; + } + + int newNCols = leftStats.Ncols + rightStats.Ncols; + double newByteSize = leftStats.Nrows ? (leftStats.ByteSize / leftStats.Nrows) * newCard : 0 + + rightStats.Nrows ? 
(rightStats.ByteSize / rightStats.Nrows) * newCard : 0; + + double cost = ComputeJoinCost(leftStats, rightStats, newCard, newByteSize, joinAlgo) + + leftStats.Cost + rightStats.Cost; + + auto result = TOptimizerStatistics(outputType, newCard, newNCols, newByteSize, cost, + leftKeyColumns ? leftStats.KeyColumns : ( rightKeyColumns ? rightStats.KeyColumns : TOptimizerStatistics::EmptyColumns)); + result.Selectivity = selectivity; + return result; +} + +const TDummyProviderContext& TDummyProviderContext::instance() { + static TDummyProviderContext staticContext; + return staticContext; +} + + } // namespace NYql diff --git a/ydb/library/yql/core/cbo/cbo_optimizer_new.h b/ydb/library/yql/core/cbo/cbo_optimizer_new.h index 63450edcc8fe..b224577964d9 100644 --- a/ydb/library/yql/core/cbo/cbo_optimizer_new.h +++ b/ydb/library/yql/core/cbo/cbo_optimizer_new.h @@ -10,13 +10,12 @@ #include #include - namespace NYql { /** * OptimizerNodes are the internal representations of operators inside the * Cost-based optimizer. Currently we only support RelOptimizerNode - a node that - * is an input relation to the equi-join, and JoinOptimizerNode - an inner join + * is an input relation to the equi-join, and JoinOptimizerNode - an inner join * that connects two sets of relations. 
*/ enum EOptimizerNodeKind: ui32 @@ -35,49 +34,13 @@ struct IBaseOptimizerNode { std::shared_ptr Stats; IBaseOptimizerNode(EOptimizerNodeKind k) : Kind(k) {} - IBaseOptimizerNode(EOptimizerNodeKind k, std::shared_ptr s) : + IBaseOptimizerNode(EOptimizerNodeKind k, std::shared_ptr s) : Kind(k), Stats(s) {} virtual TVector Labels()=0; virtual void Print(std::stringstream& stream, int ntabs=0)=0; }; -/** - * RelOptimizerNode adds a label to base class - * This is the label assinged to the input by equi-Join -*/ -struct TRelOptimizerNode : public IBaseOptimizerNode { - TString Label; - - // Temporary solution to check if a LookupJoin is possible in KQP - //void* Expr; - - TRelOptimizerNode(TString label, std::shared_ptr stats) : - IBaseOptimizerNode(RelNodeType, stats), Label(label) { } - //TRelOptimizerNode(TString label, std::shared_ptr stats, const TExprNode::TPtr expr) : - // IBaseOptimizerNode(RelNodeType, stats), Label(label), Expr(expr) { } - virtual ~TRelOptimizerNode() {} - - virtual TVector Labels(); - virtual void Print(std::stringstream& stream, int ntabs=0); -}; - -enum EJoinKind: ui32 -{ - InnerJoin, - LeftJoin, - RightJoin, - OuterJoin, - LeftOnly, - RightOnly, - LeftSemi, - RightSemi, - Cross, - Exclusion -}; - -EJoinKind ConvertToJoinKind(const TString& joinString); -TString ConvertToJoinString(const EJoinKind kind); /** * This is a temporary structure for KQP provider @@ -89,8 +52,20 @@ struct IProviderContext { virtual double ComputeJoinCost(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, const double outputRows, const double outputByteSize, EJoinAlgoType joinAlgol) const = 0; - virtual bool IsJoinApplicable(const std::shared_ptr& left, - const std::shared_ptr& right, + virtual TOptimizerStatistics ComputeJoinStats( + const TOptimizerStatistics& leftStats, + const TOptimizerStatistics& rightStats, + const std::set>& joinConditions, EJoinAlgoType joinAlgo) const = 0; + + virtual TOptimizerStatistics ComputeJoinStats( + 
const TOptimizerStatistics& leftStats, + const TOptimizerStatistics& rightStats, + const TVector& leftJoinKeys, + const TVector& rightJoinKeys, + EJoinAlgoType joinAlgo) const = 0; + + virtual bool IsJoinApplicable(const std::shared_ptr& left, + const std::shared_ptr& right, const std::set>& joinConditions, const TVector& leftJoinKeys, const TVector& rightJoinKeys, @@ -105,35 +80,68 @@ struct IProviderContext { struct TDummyProviderContext : public IProviderContext { TDummyProviderContext() {} - double ComputeJoinCost(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, const double outputRows, const double outputByteSize, EJoinAlgoType joinAlgo) const override { - Y_UNUSED(outputByteSize); - Y_UNUSED(joinAlgo); - return leftStats.Nrows + 2.0 * rightStats.Nrows + outputRows; - } + double ComputeJoinCost(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, const double outputRows, const double outputByteSize, EJoinAlgoType joinAlgo) const override; - bool IsJoinApplicable(const std::shared_ptr& left, - const std::shared_ptr& right, + bool IsJoinApplicable(const std::shared_ptr& left, + const std::shared_ptr& right, const std::set>& joinConditions, const TVector& leftJoinKeys, const TVector& rightJoinKeys, - EJoinAlgoType joinAlgo) override { + EJoinAlgoType joinAlgo) override; + + virtual TOptimizerStatistics ComputeJoinStats( + const TOptimizerStatistics& leftStats, + const TOptimizerStatistics& rightStats, + const TVector& leftJoinKeys, + const TVector& rightJoinKeys, + EJoinAlgoType joinAlgo) const override; + + virtual TOptimizerStatistics ComputeJoinStats( + const TOptimizerStatistics& leftStats, + const TOptimizerStatistics& rightStats, + const std::set>& joinConditions, + EJoinAlgoType joinAlgo) const override; - Y_UNUSED(left); - Y_UNUSED(right); - Y_UNUSED(joinConditions); - Y_UNUSED(leftJoinKeys); - Y_UNUSED(rightJoinKeys); + static const TDummyProviderContext& instance(); +}; - return joinAlgo == 
EJoinAlgoType::MapJoin; - } +/** + * RelOptimizerNode adds a label to base class + * This is the label assigned to the input by equi-Join +*/ +struct TRelOptimizerNode : public IBaseOptimizerNode { + TString Label; + + // Temporary solution to check if a LookupJoin is possible in KQP + //void* Expr; + + TRelOptimizerNode(TString label, std::shared_ptr stats) : + IBaseOptimizerNode(RelNodeType, stats), Label(label) { } + //TRelOptimizerNode(TString label, std::shared_ptr stats, const TExprNode::TPtr expr) : + // IBaseOptimizerNode(RelNodeType, stats), Label(label), Expr(expr) { } + virtual ~TRelOptimizerNode() {} - static const TDummyProviderContext& instance() { - static TDummyProviderContext staticContext; - return staticContext; - } + virtual TVector Labels(); + virtual void Print(std::stringstream& stream, int ntabs=0); +}; +enum EJoinKind: ui32 +{ + InnerJoin, + LeftJoin, + RightJoin, + OuterJoin, + LeftOnly, + RightOnly, + LeftSemi, + RightSemi, + Cross, + Exclusion }; +EJoinKind ConvertToJoinKind(const TString& joinString); +TString ConvertToJoinString(const EJoinKind kind); + /** * JoinOptimizerNode records the left and right arguments of the join * as well as the set of join conditions. 
@@ -150,11 +158,11 @@ struct TJoinOptimizerNode : public IBaseOptimizerNode { EJoinAlgoType JoinAlgo; bool IsReorderable; - TJoinOptimizerNode(const std::shared_ptr& left, - const std::shared_ptr& right, + TJoinOptimizerNode(const std::shared_ptr& left, + const std::shared_ptr& right, const std::set>& joinConditions, - const EJoinKind joinType, - const EJoinAlgoType joinAlgo, + const EJoinKind joinType, + const EJoinAlgoType joinAlgo, bool nonReorderable=false); virtual ~TJoinOptimizerNode() {} virtual TVector Labels(); diff --git a/ydb/library/yql/core/yql_cost_function.cpp b/ydb/library/yql/core/yql_cost_function.cpp index 078acd70bfb6..105e792ea44a 100644 --- a/ydb/library/yql/core/yql_cost_function.cpp +++ b/ydb/library/yql/core/yql_cost_function.cpp @@ -1,32 +1,7 @@ -#include "yql_cost_function.h" - -#include - -using namespace NYql; - -namespace { - -THashMap JoinAlgoMap = { - {"Undefined",EJoinAlgoType::Undefined}, - {"LookupJoin",EJoinAlgoType::LookupJoin}, - {"MapJoin",EJoinAlgoType::MapJoin}, - {"GraceJoin",EJoinAlgoType::GraceJoin}, - {"StreamLookupJoin",EJoinAlgoType::StreamLookupJoin}}; -bool IsPKJoin(const TOptimizerStatistics& stats, const TVector& joinKeys) { - if (stats.KeyColumns.size()==0) { - return false; - } - - for(size_t i=0; i& leftJoinKeys, const TVector& rightJoinKeys, EJoinAlgoType joinAlgo, const IProviderContext& ctx) { - - double newCard; - EStatisticsType outputType; - bool leftKeyColumns = false; - bool rightKeyColumns = false; - double selectivity = 1.0; - - - if (IsPKJoin(rightStats,rightJoinKeys)) { - newCard = leftStats.Nrows * rightStats.Selectivity; - selectivity = leftStats.Selectivity * rightStats.Selectivity; - leftKeyColumns = true; - if (leftStats.Type == EStatisticsType::BaseTable){ - outputType = EStatisticsType::FilteredFactTable; - } else { - outputType = leftStats.Type; - } - } - else if (IsPKJoin(leftStats,leftJoinKeys)) { - newCard = rightStats.Nrows; - newCard = rightStats.Nrows * leftStats.Selectivity; - 
selectivity = leftStats.Selectivity * rightStats.Selectivity; - - rightKeyColumns = true; - if (rightStats.Type == EStatisticsType::BaseTable){ - outputType = EStatisticsType::FilteredFactTable; - } else { - outputType = rightStats.Type; - } - } - else { - newCard = 0.2 * leftStats.Nrows * rightStats.Nrows; - outputType = EStatisticsType::ManyManyJoin; - } - - int newNCols = leftStats.Ncols + rightStats.Ncols; - double newByteSize = leftStats.Nrows ? (leftStats.ByteSize / leftStats.Nrows) * newCard : 0 + - rightStats.Nrows ? (rightStats.ByteSize / rightStats.Nrows) * newCard : 0; - - double cost = ctx.ComputeJoinCost(leftStats, rightStats, newCard, newByteSize, joinAlgo) - + leftStats.Cost + rightStats.Cost; - - auto result = TOptimizerStatistics(outputType, newCard, newNCols, newByteSize, cost, - leftKeyColumns ? leftStats.KeyColumns : ( rightKeyColumns ? rightStats.KeyColumns : TOptimizerStatistics::EmptyColumns)); - result.Selectivity = selectivity; - return result; -} - - -TOptimizerStatistics NYql::ComputeJoinStats(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, - const std::set>& joinConditions, EJoinAlgoType joinAlgo, const IProviderContext& ctx) { - - TVector leftJoinKeys; - TVector rightJoinKeys; - - for (auto c : joinConditions) { - leftJoinKeys.emplace_back(c.first.AttributeName); - rightJoinKeys.emplace_back(c.second.AttributeName); - } - - return ComputeJoinStats(leftStats, rightStats, leftJoinKeys, rightJoinKeys, joinAlgo, ctx); -} - +} // namespace NYql diff --git a/ydb/library/yql/core/yql_cost_function.h b/ydb/library/yql/core/yql_cost_function.h index dcacef50a82a..030774f3d303 100644 --- a/ydb/library/yql/core/yql_cost_function.h +++ b/ydb/library/yql/core/yql_cost_function.h @@ -16,16 +16,28 @@ namespace NYql { struct IProviderContext; -namespace NDq { +enum class EJoinAlgoType { + Undefined, + LookupJoin, + MapJoin, + GraceJoin, + StreamLookupJoin //Right part can be updated during an operation. 
Used mainly for joining streams with lookup tables. Currently implemented in Dq by LookupInputTransform +}; + +//StreamLookupJoin is not a subject for CBO and not included here +static constexpr auto AllJoinAlgos = { EJoinAlgoType::MapJoin, EJoinAlgoType::GraceJoin, EJoinAlgoType::LookupJoin }; + +namespace NDq { + /** - * Join column is a struct that records the relation label and + * Join column is a struct that records the relation label and * attribute name, used in join conditions */ struct TJoinColumn { TString RelName; TString AttributeName; - TJoinColumn(TString relName, TString attributeName) : RelName(relName), + TJoinColumn(TString relName, TString attributeName) : RelName(relName), AttributeName(attributeName) {} bool operator == (const TJoinColumn& other) const { @@ -43,26 +55,8 @@ struct TJoinColumn { bool operator < (const TJoinColumn& c1, const TJoinColumn& c2); -} - -enum class EJoinAlgoType { - Undefined, - LookupJoin, - MapJoin, - GraceJoin, - StreamLookupJoin //Right part can be updated during an operation. Used mainly for joining streams with lookup tables. 
Currently impplemented in Dq by LookupInputTransform -}; +} // namespace NDq TString ConvertToJoinAlgoString(EJoinAlgoType joinAlgo); -//StreamLookupJoin is not a subject for CBO and not not included here -static constexpr auto AllJoinAlgos = { EJoinAlgoType::MapJoin, EJoinAlgoType::GraceJoin, EJoinAlgoType::LookupJoin }; - -TOptimizerStatistics ComputeJoinStats(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, - const std::set>& joinConditions, EJoinAlgoType joinAlgo, const IProviderContext& ctx); - -TOptimizerStatistics ComputeJoinStats(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, - const TVector& leftJoinKeys, const TVector& rightJoinKeys, EJoinAlgoType joinAlgo, const IProviderContext& ctx); - -} - +} // namespace NYql diff --git a/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp b/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp index 809c88611fb9..54354d27bc10 100644 --- a/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp @@ -197,13 +197,12 @@ void ComputeStatistics(const std::shared_ptr& join, IProvide ComputeStatistics(static_pointer_cast(join->RightArg), ctx); } join->Stats = std::make_shared( - ComputeJoinStats( + ctx.ComputeJoinStats( *join->LeftArg->Stats, *join->RightArg->Stats, join->LeftJoinKeys, join->RightJoinKeys, EJoinAlgoType::GraceJoin, - ctx ) ); } diff --git a/ydb/library/yql/dq/opt/dq_opt_stat.cpp b/ydb/library/yql/dq/opt/dq_opt_stat.cpp index f5f5e185bb7d..21dde19f8a28 100644 --- a/ydb/library/yql/dq/opt/dq_opt_stat.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_stat.cpp @@ -162,7 +162,7 @@ void InferStatisticsForMapJoin(const TExprNode::TPtr& input, TTypeAnnotationCont } typeCtx->SetStats(join.Raw(), std::make_shared( - ComputeJoinStats(*leftStats, *rightStats, leftJoinKeys, rightJoinKeys, EJoinAlgoType::MapJoin, ctx))); + ctx.ComputeJoinStats(*leftStats, *rightStats, leftJoinKeys, rightJoinKeys, EJoinAlgoType::MapJoin))); } 
/** @@ -194,7 +194,7 @@ void InferStatisticsForGraceJoin(const TExprNode::TPtr& input, TTypeAnnotationCo } typeCtx->SetStats(join.Raw(), std::make_shared( - ComputeJoinStats(*leftStats, *rightStats, leftJoinKeys, rightJoinKeys, EJoinAlgoType::GraceJoin, ctx))); + ctx.ComputeJoinStats(*leftStats, *rightStats, leftJoinKeys, rightJoinKeys, EJoinAlgoType::GraceJoin))); } /** diff --git a/ydb/library/yql/dq/opt/dq_opt_stat.h b/ydb/library/yql/dq/opt/dq_opt_stat.h index 31013988c3f5..1505f383d933 100644 --- a/ydb/library/yql/dq/opt/dq_opt_stat.h +++ b/ydb/library/yql/dq/opt/dq_opt_stat.h @@ -1,3 +1,5 @@ +#pragma once + #include "dq_opt.h" #include From 3fa1fe2fd333bf7acdeb3256d209af40bee18ea2 Mon Sep 17 00:00:00 2001 From: Dmitry Orlov Date: Thu, 18 Apr 2024 17:16:37 +0300 Subject: [PATCH 2/3] make it compile --- ydb/library/yql/dq/opt/dq_opt_dphyp_solver.h | 4 ++-- ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp | 2 +- ydb/library/yql/dq/opt/dq_opt_join_tree_node.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ydb/library/yql/dq/opt/dq_opt_dphyp_solver.h b/ydb/library/yql/dq/opt/dq_opt_dphyp_solver.h index da042c374058..370ca0804d74 100644 --- a/ydb/library/yql/dq/opt/dq_opt_dphyp_solver.h +++ b/ydb/library/yql/dq/opt/dq_opt_dphyp_solver.h @@ -405,7 +405,7 @@ template std::shared_ptr TDPHypS for (auto joinAlgo : AllJoinAlgos) { if (ctx.IsJoinApplicable(left, right, joinConditions, leftJoinKeys, rightJoinKeys, joinAlgo)){ - auto cost = ComputeJoinStats(*left->Stats, *right->Stats, leftJoinKeys, rightJoinKeys, joinAlgo, ctx).Cost; + auto cost = ctx.ComputeJoinStats(*left->Stats, *right->Stats, leftJoinKeys, rightJoinKeys, joinAlgo).Cost; if (cost < bestCost) { bestCost = cost; bestAlgo = joinAlgo; @@ -415,7 +415,7 @@ template std::shared_ptr TDPHypS if (isCommutative) { if (ctx.IsJoinApplicable(right, left, reversedJoinConditions, rightJoinKeys, leftJoinKeys, joinAlgo)){ - auto cost = ComputeJoinStats(*right->Stats, *left->Stats, rightJoinKeys, 
leftJoinKeys, joinAlgo, ctx).Cost; + auto cost = ctx.ComputeJoinStats(*right->Stats, *left->Stats, rightJoinKeys, leftJoinKeys, joinAlgo).Cost; if (cost < bestCost) { bestCost = cost; bestAlgo = joinAlgo; diff --git a/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp b/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp index 54354d27bc10..fd5ed9f2c238 100644 --- a/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp @@ -202,7 +202,7 @@ void ComputeStatistics(const std::shared_ptr& join, IProvide *join->RightArg->Stats, join->LeftJoinKeys, join->RightJoinKeys, - EJoinAlgoType::GraceJoin, + EJoinAlgoType::GraceJoin ) ); } diff --git a/ydb/library/yql/dq/opt/dq_opt_join_tree_node.cpp b/ydb/library/yql/dq/opt/dq_opt_join_tree_node.cpp index 1c4db89bb627..181ba5db080c 100644 --- a/ydb/library/yql/dq/opt/dq_opt_join_tree_node.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_join_tree_node.cpp @@ -13,7 +13,7 @@ std::shared_ptr MakeJoinInternal( IProviderContext& ctx) { auto res = std::make_shared(left, right, joinConditions, leftJoinKeys, rightJoinKeys, joinKind, joinAlgo); - res->Stats = std::make_shared(ComputeJoinStats(*left->Stats, *right->Stats, leftJoinKeys, rightJoinKeys, joinAlgo, ctx)); + res->Stats = std::make_shared(ctx.ComputeJoinStats(*left->Stats, *right->Stats, leftJoinKeys, rightJoinKeys, joinAlgo)); return res; } From 27ad45db503dc8f5d0829cf8eef59bf2bcf90bba Mon Sep 17 00:00:00 2001 From: Dmitry Orlov Date: Thu, 18 Apr 2024 19:34:18 +0300 Subject: [PATCH 3/3] Add specific statistic --- ydb/library/yql/core/yql_statistics.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ydb/library/yql/core/yql_statistics.h b/ydb/library/yql/core/yql_statistics.h index e92473fb2791..5d2c6a1c91df 100644 --- a/ydb/library/yql/core/yql_statistics.h +++ b/ydb/library/yql/core/yql_statistics.h @@ -13,6 +13,12 @@ enum EStatisticsType : ui32 { ManyManyJoin }; +// Providers may subclass this struct to associate specific 
statistics, useful to +// derive stats for higher-level operators in the plan. +struct IProviderStatistics { + virtual ~IProviderStatistics() {} +}; + /** * Optimizer Statistics struct records per-table and per-column statistics * for the current operator in the plan. Currently, only Nrows and Ncols are @@ -28,6 +34,7 @@ struct TOptimizerStatistics { double Cost = 0; double Selectivity = 1.0; const TVector& KeyColumns; + const IProviderStatistics* Specific = nullptr; TOptimizerStatistics() : KeyColumns(EmptyColumns) {} TOptimizerStatistics(double nrows, int ncols): Nrows(nrows), Ncols(ncols), KeyColumns(EmptyColumns) {} @@ -36,6 +43,8 @@ struct TOptimizerStatistics { TOptimizerStatistics(EStatisticsType type, double nrows, int ncols, double byteSize, double cost): Type(type), Nrows(nrows), Ncols(ncols), ByteSize(byteSize), Cost(cost), KeyColumns(EmptyColumns) {} TOptimizerStatistics(EStatisticsType type, double nrows, int ncols, double cost, const TVector& keyColumns): Type(type), Nrows(nrows), Ncols(ncols), Cost(cost), KeyColumns(keyColumns) {} TOptimizerStatistics(EStatisticsType type, double nrows, int ncols, double byteSize, double cost, const TVector& keyColumns): Type(type), Nrows(nrows), Ncols(ncols), ByteSize(byteSize), Cost(cost), KeyColumns(keyColumns) {} + TOptimizerStatistics(EStatisticsType type, double nrows, int ncols, double byteSize, double cost, const TVector& keyColumns, IProviderStatistics* specific) + : Type(type), Nrows(nrows), Ncols(ncols), ByteSize(byteSize), Cost(cost), KeyColumns(keyColumns), Specific(specific) {} TOptimizerStatistics& operator+=(const TOptimizerStatistics& other); bool Empty() const;