Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed bug in S3 Physical Optimizer #12985

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2306,6 +2306,96 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString());
UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "secret with name 'TestSecret' not found");
}

// Checks that INSERT ... SELECT from a column-store (STORE = COLUMN) table into
// S3-backed external tables runs to completion, both for a plain external table
// and for one partitioned by `year`.
Y_UNIT_TEST(TestOlapToS3Insert) {
    // Names of the schema objects used throughout the scenario.
    const TString root = "/Root/";
    const TString source = "source";
    const TString table1 = "table1";
    const TString table2 = "table2";
    const TString bucket = "bucket";
    const TString olapTable = "DestinationOlap";

    CreateBucket(bucket);

    auto kikimr = NTestUtils::MakeKikimrRunner();

    auto tableClient = kikimr->GetTableClient();
    auto session = tableClient.CreateSession().GetValueSync().GetSession();

    // A single scheme script creates the external data source, both external
    // tables (the second one partitioned by `year`) and the column-store table.
    const TString schemeQuery = fmt::format(R"(
CREATE EXTERNAL DATA SOURCE `{source}` WITH (
SOURCE_TYPE="ObjectStorage",
LOCATION="{location}",
AUTH_METHOD="NONE"
);
CREATE EXTERNAL TABLE `{table1}` (
key Int64 NOT NULL,
value String NOT NULL,
) WITH (
DATA_SOURCE="{source}",
LOCATION="/{location_table1}/",
FORMAT="csv_with_names"
);
CREATE EXTERNAL TABLE `{table2}` (
key Int64 NOT NULL,
value String NOT NULL,
year String NOT NULL
) WITH (
DATA_SOURCE="{source}",
LOCATION="/{location_table2}/",
FORMAT="csv_with_names",
PARTITIONED_BY="['year']"
);
CREATE TABLE `{olap_table}` (
key Int64 NOT NULL,
value String NOT NULL,
PRIMARY KEY (key)
)
WITH (STORE = COLUMN);)",
        "location"_a = GetBucketLocation(bucket),
        "source"_a = root + source,
        "table1"_a = root + table1,
        "table2"_a = root + table2,
        "location_table1"_a = table1,
        "location_table2"_a = table2,
        "olap_table"_a = olapTable
    );
    auto schemeResult = session.ExecuteSchemeQuery(schemeQuery).GetValueSync();
    UNIT_ASSERT_VALUES_EQUAL_C(schemeResult.GetStatus(), NYdb::EStatus::SUCCESS, schemeResult.GetIssues().ToString());

    auto queryClient = kikimr->GetQueryClient();

    // Submits `sql` as a script, checks that the submission succeeded and got an
    // execution id, then waits for the operation and requires it to be Completed.
    const auto runScriptAndExpectCompleted = [&](const TString& sql) {
        auto submitted = queryClient.ExecuteScript(sql).ExtractValueSync();
        UNIT_ASSERT_VALUES_EQUAL_C(submitted.Status().GetStatus(), EStatus::SUCCESS, submitted.Status().GetIssues().ToString());
        UNIT_ASSERT(submitted.Metadata().ExecutionId);

        NYdb::NQuery::TScriptExecutionOperation finished = WaitScriptExecutionOperation(submitted.Id(), kikimr->GetDriver());
        UNIT_ASSERT_EQUAL_C(finished.Metadata().ExecStatus, EExecStatus::Completed, finished.Status().GetIssues().ToString());
    };

    // Case 1: write into the non-partitioned external table.
    runScriptAndExpectCompleted(fmt::format(R"(
INSERT INTO {destination}
SELECT key, value FROM {source} LIMIT 1;)",
        "destination"_a = table1,
        "source"_a = olapTable));

    // Case 2: write into the external table partitioned by `year`; the partition
    // column is supplied as a constant in the SELECT list.
    runScriptAndExpectCompleted(fmt::format(R"(
INSERT INTO {destination}
SELECT key, value, "2024" AS year FROM {source} LIMIT 1;)",
        "destination"_a = table2,
        "source"_a = olapTable));
}
}

} // namespace NKikimr::NKqp
127 changes: 64 additions & 63 deletions ydb/library/yql/providers/s3/provider/yql_s3_phy_opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,22 +199,34 @@ class TS3PhysicalOptProposalTransformer : public TOptimizeTransformerBase {
}
}

if (!TDqCnUnionAll::Match(input.Raw())) {
return Nothing();
}

const TParentsMap* parentsMap = getParents();
const auto dqUnion = input.Cast<TDqCnUnionAll>();
if (!NDq::IsSingleConsumerConnection(dqUnion, *parentsMap)) {
return Nothing();
}

const auto inputStage = dqUnion.Output().Stage().Cast<TDqStage>();

if (!FindNode(input.Ptr(), [] (const TExprNode::TPtr& node) { return node->IsCallable(TCoDataSource::CallableName()); })) {
YQL_CLOG(INFO, ProviderS3) << "Rewrite pure S3WriteObject `" << cluster << "`.`" << target.Path().StringValue() << "` as stage with sink.";
return keys.empty() ?
Build<TDqStage>(ctx, writePos)
.Inputs().Build()
.Program<TCoLambda>()
.Args({})
.Body<TS3SinkOutput>()
.Input<TCoToFlow>()
.Input(input)
.Build()
.Format(target.Format())
.KeyColumns().Build()
.Settings(sinkOutputSettingsBuilder.Done())
.Build()

if (keys.empty()) {
const auto outputBuilder = Build<TS3SinkOutput>(ctx, target.Pos())
.Input<TCoToFlow>()
.Input(inputStage.Program().Body().Ptr())
.Build()
.Format(target.Format())
.KeyColumns().Build()
.Settings(sinkOutputSettingsBuilder.Done())
.Done();

return Build<TDqStage>(ctx, writePos)
.InitFrom(inputStage)
.Program(ctx.DeepCopyLambda(inputStage.Program().Ref(), outputBuilder.Ptr()))
.Outputs<TDqStageOutputsList>()
.Add<TDqSink>()
.DataSink(dataSink)
Expand All @@ -230,68 +242,57 @@ class TS3PhysicalOptProposalTransformer : public TOptimizeTransformerBase {
.Build()
.Build()
.Settings().Build()
.Done()
:
Build<TDqStage>(ctx, writePos)
.Inputs()
.Add<TDqCnHashShuffle>()
.Output<TDqOutput>()
.Stage<TDqStage>()
.Inputs().Build()
.Program<TCoLambda>()
.Args({})
.Body<TCoToFlow>()
.Input(input)
.Build()
.Build()
.Settings().Build()
.Build()
.Index().Value("0", TNodeFlags::Default).Build()
.Done();
} else {
const auto outputBuilder = Build<TCoToFlow>(ctx, target.Pos())
.Input(inputStage.Program().Body().Ptr())
.Done();

return Build<TDqStage>(ctx, writePos)
.Inputs()
.Add<TDqCnHashShuffle>()
.Output<TDqOutput>()
.Stage<TDqStage>()
.InitFrom(inputStage)
.Program(ctx.DeepCopyLambda(inputStage.Program().Ref(), outputBuilder.Ptr()))
.Settings().Build()
.Build()
.KeyColumns().Add(keys).Build()
.Index().Value("0", TNodeFlags::Default).Build()
.Build()
.KeyColumns().Add(keys).Build()
.Build()
.Program<TCoLambda>()
.Args({"in"})
.Body<TS3SinkOutput>()
.Input("in")
.Format(target.Format())
.KeyColumns().Add(keys).Build()
.Settings(sinkOutputSettingsBuilder.Done())
.Build()
.Build()
.Program<TCoLambda>()
.Args({"in"})
.Body<TS3SinkOutput>()
.Input("in")
.Format(target.Format())
.KeyColumns().Add(keys).Build()
.Settings(sinkOutputSettingsBuilder.Done())
.Build()
.Outputs<TDqStageOutputsList>()
.Add<TDqSink>()
.DataSink(dataSink)
.Index().Value("0", TNodeFlags::Default).Build()
.Settings<TS3SinkSettings>()
.Path(target.Path())
.Settings(sinkSettingsBuilder.Done())
.Token<TCoSecureParam>()
.Name().Build(token)
.Build()
.Extension().Value(extension).Build()
.Build()
.Outputs<TDqStageOutputsList>()
.Add<TDqSink>()
.DataSink(dataSink)
.Index().Value("0", TNodeFlags::Default).Build()
.Settings<TS3SinkSettings>()
.Path(target.Path())
.Settings(sinkSettingsBuilder.Done())
.Token<TCoSecureParam>()
.Name().Build(token)
.Build()
.Extension().Value(extension).Build()
.Build()
.Build()
.Settings().Build()
.Done();
}

if (!TDqCnUnionAll::Match(input.Raw())) {
return Nothing();
}
.Build()
.Settings().Build()
.Done();
}

const TParentsMap* parentsMap = getParents();
const auto dqUnion = input.Cast<TDqCnUnionAll>();
if (!NDq::IsSingleConsumerConnection(dqUnion, *parentsMap)) {
return Nothing();
}

YQL_CLOG(INFO, ProviderS3) << "Rewrite S3WriteObject `" << cluster << "`.`" << target.Path().StringValue() << "` as sink.";

const auto inputStage = dqUnion.Output().Stage().Cast<TDqStage>();

const auto sink = Build<TDqSink>(ctx, writePos)
.DataSink(dataSink)
.Index(dqUnion.Output().Index())
Expand Down
Loading