Skip to content

Commit

Permalink
Merge bea3bec into 636451e
Browse files Browse the repository at this point in the history
  • Loading branch information
alephonea authored May 22, 2024
2 parents 636451e + bea3bec commit e23d56f
Show file tree
Hide file tree
Showing 22 changed files with 311 additions and 258 deletions.
18 changes: 6 additions & 12 deletions ydb/core/kqp/opt/kqp_statistics_transformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ void InferStatisticsForReadTable(const TExprNode::TPtr& input, TTypeAnnotationCo

YQL_CLOG(TRACE, CoreDq) << "Infer statistics for read table, nrows:" << nRows << ", nattrs: " << nAttrs;

auto outputStats = TOptimizerStatistics(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, 0.0, tableData.Metadata->KeyColumnNames);
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(outputStats));
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, 0.0, tableData.Metadata->KeyColumnNames));
}

/**
Expand All @@ -63,8 +62,7 @@ void InferStatisticsForKqpTable(const TExprNode::TPtr& input, TTypeAnnotationCon
int nAttrs = tableData.Metadata->Columns.size();
YQL_CLOG(TRACE, CoreDq) << "Infer statistics for table: " << path.Value() << ", nrows: " << nRows << ", nattrs: " << nAttrs << ", nKeyColumns: " << tableData.Metadata->KeyColumnNames.size();

auto outputStats = TOptimizerStatistics(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, 0.0, tableData.Metadata->KeyColumnNames);
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(outputStats));
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, 0.0, tableData.Metadata->KeyColumnNames));
}

/**
Expand All @@ -84,8 +82,7 @@ void InferStatisticsForSteamLookup(const TExprNode::TPtr& input, TTypeAnnotation
auto inputStats = typeCtx->GetStats(streamLookup.Table().Raw());
auto byteSize = inputStats->ByteSize * (nAttrs / (double) inputStats->Ncols);

auto outputStats = TOptimizerStatistics(EStatisticsType::BaseTable, inputStats->Nrows, nAttrs, byteSize, 0, inputStats->KeyColumns);
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(outputStats));
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, inputStats->Nrows, nAttrs, byteSize, 0, inputStats->KeyColumns));
}

/**
Expand Down Expand Up @@ -116,8 +113,7 @@ void InferStatisticsForLookupTable(const TExprNode::TPtr& input, TTypeAnnotation
byteSize = 10;
}

auto outputStats = TOptimizerStatistics(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, 0, inputStats->KeyColumns);
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(outputStats));
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, 0, inputStats->KeyColumns));
}

/**
Expand Down Expand Up @@ -151,17 +147,15 @@ void InferStatisticsForRowsSourceSettings(const TExprNode::TPtr& input, TTypeAnn
double cost = inputStats->Cost;
double byteSize = inputStats->ByteSize * (nAttrs / (double)inputStats->Ncols);

auto outputStats = TOptimizerStatistics(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, cost, inputStats->KeyColumns);
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(outputStats));
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, cost, inputStats->KeyColumns));
}

/**
 * Compute statistics for index lookup
 *
 * Currently we just make up the numbers: cardinality 5, 5 attributes,
 * byte size 20 and cost 0. No key columns are passed.
 */
void InferStatisticsForIndexLookup(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx) {
    // Build the statistics object directly inside make_shared and register it once;
    // no intermediate TOptimizerStatistics value needs to be created and copied.
    typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, 5, 5, 20, 0.0));
}

/***
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/kqp/opt/logical/kqp_opt_cbo.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ struct TKqpRelOptimizerNode : public NYql::TRelOptimizerNode {
/**
* KQP-specific join cost function and join applicability function
*/
struct TKqpProviderContext : public NYql::IProviderContext {
struct TKqpProviderContext : public NYql::TBaseProviderContext {
TKqpProviderContext(const TKqpOptimizeContext& kqpCtx, const int optLevel) : KqpCtx(kqpCtx), OptLevel(optLevel) {}

virtual bool IsJoinApplicable(const std::shared_ptr<NYql::IBaseOptimizerNode>& left,
Expand All @@ -35,4 +35,4 @@ struct TKqpProviderContext : public NYql::IProviderContext {
int OptLevel;
};

}
}
125 changes: 124 additions & 1 deletion ydb/library/yql/core/cbo/cbo_optimizer_new.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

#include <library/cpp/disjoint_sets/disjoint_sets.h>

const TString& ToString(NYql::EJoinKind);
const TString& ToString(NYql::EJoinAlgoType);

namespace NYql {

using namespace NYql::NDq;
Expand Down Expand Up @@ -89,7 +92,8 @@ void TJoinOptimizerNode::Print(std::stringstream& stream, int ntabs) {
stream << " ";
}

stream << "Join: (" << ConvertToJoinString(JoinType) << "," << ConvertToJoinAlgoString(JoinAlgo) << ") ";
stream << "Join: (" << ToString(JoinType) << "," << ToString(JoinAlgo) << ") ";

for (auto c : JoinConditions){
stream << c.first.RelName << "." << c.first.AttributeName
<< "=" << c.second.RelName << "."
Expand All @@ -109,4 +113,123 @@ void TJoinOptimizerNode::Print(std::stringstream& stream, int ntabs) {
RightArg->Print(stream, ntabs+1);
}

/**
 * Check whether the join keys cover every key column recorded in the statistics.
 *
 * Returns false when the statistics carry no key columns at all; otherwise
 * returns true only if each entry of stats.KeyColumns is found in joinKeys.
 */
bool IsPKJoin(const TOptimizerStatistics& stats, const TVector<TString>& joinKeys) {
    if (stats.KeyColumns.empty()) {
        return false;
    }

    for (const auto& keyColumn : stats.KeyColumns) {
        const bool isJoinKey = std::find(joinKeys.begin(), joinKeys.end(), keyColumn) != joinKeys.end();
        if (!isJoinKey) {
            // At least one key column is not part of the join condition.
            return false;
        }
    }
    return true;
}

/**
 * Default join applicability check.
 *
 * Only the requested algorithm is inspected: the input nodes, the join
 * conditions and the join keys are ignored, and MapJoin is the only
 * algorithm reported as applicable.
 */
bool TBaseProviderContext::IsJoinApplicable(const std::shared_ptr<IBaseOptimizerNode>& left,
    const std::shared_ptr<IBaseOptimizerNode>& right,
    const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions,
    const TVector<TString>& leftJoinKeys,
    const TVector<TString>& rightJoinKeys,
    EJoinAlgoType joinAlgo) {

    Y_UNUSED(left);
    Y_UNUSED(right);
    Y_UNUSED(joinConditions);
    Y_UNUSED(leftJoinKeys);
    Y_UNUSED(rightJoinKeys);

    switch (joinAlgo) {
        case EJoinAlgoType::MapJoin:
            return true;
        default:
            return false;
    }
}

/**
 * Default join cost model.
 *
 * The cost is the number of left rows, plus twice the number of right rows,
 * plus the number of output rows. The output byte size and the join
 * algorithm do not influence the result.
 */
double TBaseProviderContext::ComputeJoinCost(
    const TOptimizerStatistics& leftStats,
    const TOptimizerStatistics& rightStats,
    const double outputRows,
    const double outputByteSize,
    EJoinAlgoType joinAlgo) const
{
    Y_UNUSED(outputByteSize);
    Y_UNUSED(joinAlgo);

    // Rows of the right input are weighted twice as heavily as rows of the left input.
    const double leftTerm = leftStats.Nrows;
    const double rightTerm = 2.0 * rightStats.Nrows;
    return leftTerm + rightTerm + outputRows;
}

/**
 * Compute the cost and output cardinality of a join
 *
 * Currently a very basic computation targeted at GraceJoin
 *
 * The build is on the right side, so we make the build side a bit more expensive than the probe
 *
 * This overload takes the join conditions as column pairs. It extracts the
 * attribute name of the left and right column of every condition (relation
 * names are not used) and delegates to the overload that takes join key names.
 */
TOptimizerStatistics TBaseProviderContext::ComputeJoinStats(
    const TOptimizerStatistics& leftStats,
    const TOptimizerStatistics& rightStats,
    const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions,
    EJoinAlgoType joinAlgo) const
{
    TVector<TString> leftJoinKeys;
    TVector<TString> rightJoinKeys;
    leftJoinKeys.reserve(joinConditions.size());
    rightJoinKeys.reserve(joinConditions.size());

    // Iterate by const reference: copying a pair of join columns per iteration is unnecessary.
    for (const auto& condition : joinConditions) {
        leftJoinKeys.emplace_back(condition.first.AttributeName);
        rightJoinKeys.emplace_back(condition.second.AttributeName);
    }

    return ComputeJoinStats(leftStats, rightStats, leftJoinKeys, rightJoinKeys, joinAlgo);
}

/**
 * Estimate the statistics of a join from the statistics of its inputs and
 * the names of the join keys on each side.
 *
 * Three cases are distinguished, checked in this order:
 *  - the right join keys cover all key columns of the right input:
 *    cardinality is leftStats.Nrows * rightStats.Selectivity and the result
 *    keeps the key columns of the left input;
 *  - the left join keys cover all key columns of the left input:
 *    cardinality is rightStats.Nrows * leftStats.Selectivity and the result
 *    keeps the key columns of the right input;
 *  - otherwise: cardinality is 0.2 * leftStats.Nrows * rightStats.Nrows, the
 *    result type is ManyManyJoin, it has no key columns and selectivity 1.0.
 *
 * In the first two cases the output type is FilteredFactTable when the
 * non-key side is a BaseTable, otherwise the type of that side is kept, and
 * the output selectivity is the product of both input selectivities.
 *
 * The resulting cost is ComputeJoinCost() for this join plus the costs
 * already accumulated in both inputs.
 */
TOptimizerStatistics TBaseProviderContext::ComputeJoinStats(
    const TOptimizerStatistics& leftStats,
    const TOptimizerStatistics& rightStats,
    const TVector<TString>& leftJoinKeys,
    const TVector<TString>& rightJoinKeys,
    EJoinAlgoType joinAlgo) const
{
    double newCard;
    EStatisticsType outputType;
    bool leftKeyColumns = false;
    bool rightKeyColumns = false;
    double selectivity = 1.0;

    if (IsPKJoin(rightStats, rightJoinKeys)) {
        newCard = leftStats.Nrows * rightStats.Selectivity;
        selectivity = leftStats.Selectivity * rightStats.Selectivity;
        leftKeyColumns = true;
        if (leftStats.Type == EStatisticsType::BaseTable) {
            outputType = EStatisticsType::FilteredFactTable;
        } else {
            outputType = leftStats.Type;
        }
    } else if (IsPKJoin(leftStats, leftJoinKeys)) {
        newCard = rightStats.Nrows * leftStats.Selectivity;
        selectivity = leftStats.Selectivity * rightStats.Selectivity;
        rightKeyColumns = true;
        if (rightStats.Type == EStatisticsType::BaseTable) {
            outputType = EStatisticsType::FilteredFactTable;
        } else {
            outputType = rightStats.Type;
        }
    } else {
        newCard = 0.2 * leftStats.Nrows * rightStats.Nrows;
        outputType = EStatisticsType::ManyManyJoin;
    }

    const int newNCols = leftStats.Ncols + rightStats.Ncols;

    // Each side contributes (average bytes per row) * output cardinality; a side
    // with zero rows contributes nothing. The two terms are computed separately
    // because `a ? x : 0 + b ? y : 0` parses as `a ? x : ((0 + b) ? y : 0)`,
    // which would drop the right-hand contribution whenever the left side has rows.
    const double leftBytes = leftStats.Nrows ? (leftStats.ByteSize / leftStats.Nrows) * newCard : 0.0;
    const double rightBytes = rightStats.Nrows ? (rightStats.ByteSize / rightStats.Nrows) * newCard : 0.0;
    const double newByteSize = leftBytes + rightBytes;

    const double cost = ComputeJoinCost(leftStats, rightStats, newCard, newByteSize, joinAlgo)
        + leftStats.Cost + rightStats.Cost;

    auto result = TOptimizerStatistics(outputType, newCard, newNCols, newByteSize, cost,
        leftKeyColumns ? leftStats.KeyColumns : ( rightKeyColumns ? rightStats.KeyColumns : TOptimizerStatistics::EmptyColumns));
    result.Selectivity = selectivity;
    return result;
}

/**
 * Access a shared default provider context.
 *
 * The object is a function-local static, so every call returns a reference
 * to the same instance.
 */
const TBaseProviderContext& TBaseProviderContext::Instance() {
    static TBaseProviderContext defaultContext;
    return defaultContext;
}


} // namespace NYql
Loading

0 comments on commit e23d56f

Please sign in to comment.