From 84a22ebb8b81d645dd860048440a4417376690bf Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Wed, 26 Jun 2024 22:50:56 +0200 Subject: [PATCH] non partitioned lister has been sped up (#5870) --- .../provider/yql_s3_datasource_type_ann.cpp | 6 ---- .../s3/provider/yql_s3_io_discovery.cpp | 4 +++ .../s3/range_helpers/file_tree_builder.cpp | 3 +- .../s3/range_helpers/file_tree_builder_ut.cpp | 30 +++++++++++++++++++ .../s3/range_helpers/path_list_reader.cpp | 4 +-- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_datasource_type_ann.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_datasource_type_ann.cpp index 43cd5c11acb1..6c9be1472d0d 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_datasource_type_ann.cpp @@ -22,12 +22,6 @@ bool ValidateS3PackedPaths(TPositionHandle pos, TStringBuf blob, bool isTextEnco try { TPathList paths; UnpackPathsList(blob, isTextEncoded, paths); - for (size_t i = 0; i < paths.size(); ++i) { - if (paths[i].Path.empty()) { - ctx.AddError(TIssue(ctx.GetPosition(pos), TStringBuilder() << "Expected non-empty path (index " << i << ")")); - return false; - } - } } catch (const std::exception& ex) { ctx.AddError(TIssue(ctx.GetPosition(pos), TStringBuilder() << "Failed to parse packed paths: " << ex.what())); return false; diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp index cf38f80382bd..6d3f3275ae28 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp @@ -864,6 +864,10 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { entries.Directories.back().Path = req.S3Request.Pattern; future = NThreading::MakeFuture(std::move(entries)); } else { + auto useRuntimeListing = State_->Configuration->UseRuntimeListing.Get().GetOrElse(false); + if (useRuntimeListing && !req.Options.IsPartitionedDataset) { + req.Options.MaxResultSet = 1; + } future = ListingStrategy_->List(req.S3Request, req.Options); } PendingRequests_[req] = future; diff --git a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp index a2ec1d2c3635..4eb9efe4e543 100644 --- a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp +++ b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp @@ -8,7 +8,7 @@ void TFileTreeBuilder::AddPath(const TString& path, ui64 fileSize, bool isDirect for (size_t i = 0, size = parts.size(); i < size; ++i) { bool isSubDirectory = i != size - 1; if (!isSubDirectory) { - TPath& p = (*currentChildren)[TTreeKey{parts[i], isDirectory}]; + TPath& p = (*currentChildren)[TTreeKey{path == "" ? "/" : parts[i], isDirectory}]; Y_ABORT_UNLESS(p.FileSize == 0); Y_ABORT_UNLESS(!p.Read); p.FileSize = fileSize; @@ -16,7 +16,6 @@ void TFileTreeBuilder::AddPath(const TString& path, ui64 fileSize, bool isDirect } else { TPath& p = (*currentChildren)[TTreeKey{parts[i], isSubDirectory}]; currentChildren = &p.Children; - } } } diff --git a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp index 4ae5a609ab35..07ca574e18d4 100644 --- a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp +++ b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp @@ -179,6 +179,36 @@ Y_UNIT_TEST_SUITE(S3FileTreeBuilderTest) { UNIT_ASSERT_VALUES_EQUAL(paths[1].IsDirectory, false); UNIT_ASSERT_VALUES_EQUAL(paths[1].PathIndex, 1); } + + Y_UNIT_TEST(DeserializesRootSlash) { + TFileTreeBuilder b; + b.AddPath("/root/name/", 3, true); + b.AddPath("", 42, true); + b.AddPath("//", 42, true); + + NS3::TRange range; + b.Save(&range); + + TPathList paths; + ReadPathsList({}, MakeParams(range), {}, paths); + + UNIT_ASSERT_VALUES_EQUAL(paths.size(), 3); + + UNIT_ASSERT_VALUES_EQUAL(paths[0].Path, "//"); + UNIT_ASSERT_VALUES_EQUAL(paths[0].Size, 42); + UNIT_ASSERT_VALUES_EQUAL(paths[0].IsDirectory, true); + UNIT_ASSERT_VALUES_EQUAL(paths[0].PathIndex, 0); + + UNIT_ASSERT_VALUES_EQUAL(paths[1].Path, "/root/name/"); + UNIT_ASSERT_VALUES_EQUAL(paths[1].Size, 3); + UNIT_ASSERT_VALUES_EQUAL(paths[1].IsDirectory, true); + UNIT_ASSERT_VALUES_EQUAL(paths[1].PathIndex, 1); + + UNIT_ASSERT_VALUES_EQUAL(paths[2].Path, ""); + UNIT_ASSERT_VALUES_EQUAL(paths[2].Size, 42); + UNIT_ASSERT_VALUES_EQUAL(paths[2].IsDirectory, true); + UNIT_ASSERT_VALUES_EQUAL(paths[2].PathIndex, 2); + } } } // namespace NYql::NS3Details \ No newline at end of file diff --git a/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp index d30685a45fce..54cf3b3cd963 100644 --- a/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp +++ b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp @@ -22,10 +22,10 @@ static void BuildPathsFromTree(const google::protobuf::RepeatedPtrField