From b24b561cbcc6c57f43af13a81503a7a402eb10fe Mon Sep 17 00:00:00 2001
From: Ruoxi Sun
Date: Thu, 13 Jun 2024 17:19:55 +0800
Subject: [PATCH] Repro WIP

---
Reviewer notes (free-form text in this position is not part of the commit):
  * This patch was recovered from a copy whose whitespace had been collapsed
    and whose angle-bracket spans had been stripped. Line breaks, indentation
    and the C++ template arguments were reconstructed from how each value is
    used in the added code -- TODO confirm against the original commit.
  * The sender address on the "From:" header was lost in the same way and is
    not reconstructed here; restore it before feeding this to `git am`.
  * Context-line indentation is a best guess; if a hunk is rejected, retry
    with `git apply --ignore-whitespace`.
  * TEST(GH41813, GH41813) reads a hard-coded absolute path under
    /Users/zanmato, so it can only pass where that reproducer data exists.

 cpp/CMakePresets.json                 | 13 ++++++
 cpp/src/arrow/dataset/dataset_test.cc | 58 +++++++++++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json
index 13d1241990c31..1fdb0c0456512 100644
--- a/cpp/CMakePresets.json
+++ b/cpp/CMakePresets.json
@@ -250,6 +250,19 @@
       "displayName": "Debug build with tests and more optional components",
       "cacheVariables": {}
     },
+    {
+      "name": "fix-41813",
+      "inherits": [
+        "base-debug",
+        "features-main"
+      ],
+      "displayName": "Fix 41813",
+      "cacheVariables": {
+        "ARROW_JEMALLOC": "OFF",
+        "ARROW_MIMALLOC": "OFF",
+        "ARROW_USE_ASAN": "ON"
+      }
+    },
     {
       "name": "ninja-debug-cuda",
       "inherits": [
diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc
index eb3fd0e304750..c2f32cf196711 100644
--- a/cpp/src/arrow/dataset/dataset_test.cc
+++ b/cpp/src/arrow/dataset/dataset_test.cc
@@ -21,6 +21,7 @@
 
 #include "arrow/dataset/dataset_internal.h"
 #include "arrow/dataset/discovery.h"
+#include "arrow/dataset/file_parquet.h"
 #include "arrow/dataset/partition.h"
 #include "arrow/dataset/test_util_internal.h"
 #include "arrow/filesystem/mockfs.h"
@@ -801,5 +802,62 @@ TEST(TestDictPartitionColumn, SelectPartitionColumnFilterPhysicalColumn) {
                     *ArrayFromJSON(partition_field->type(), R"(["one"])"));
 }
 
+namespace fs = arrow::fs;
+namespace ds = arrow::dataset;
+namespace cp = arrow::compute;
+
+arrow::Result<std::shared_ptr<fs::FileSystem>> GetFileSystemFromUri(
+    const std::string& uri, std::string* path) {
+  return fs::FileSystemFromUri(uri, path);
+}
+
+arrow::Result<std::shared_ptr<ds::Dataset>> GetDatasetFromDirectory(
+    std::shared_ptr<fs::FileSystem> fs, std::shared_ptr<ds::ParquetFileFormat> format,
+    std::string dir) {
+  // Find all files under `path`
+  fs::FileSelector s;
+  s.base_dir = dir;
+  s.recursive = true;
+
+  ds::FileSystemFactoryOptions options;
+  // The factory will try to build a child dataset.
+  ARROW_ASSIGN_OR_RAISE(auto factory,
+                        ds::FileSystemDatasetFactory::Make(fs, s, format, options));
+
+  // Try to infer a common schema for all files.
+  ARROW_ASSIGN_OR_RAISE(auto schema, factory->Inspect({}));
+  // Caller can optionally decide another schema as long as it is compatible
+  // with the previous one, e.g. `factory->Finish(compatible_schema)`.
+  ARROW_ASSIGN_OR_RAISE(auto child, factory->Finish());
+
+  ds::DatasetVector children{1, child};
+  auto dataset = ds::UnionDataset::Make(std::move(schema), std::move(children));
+
+  return dataset;
+}
+
+arrow::Result<std::shared_ptr<ds::Scanner>> GetScannerFromDataset(
+    std::shared_ptr<ds::Dataset> dataset) {
+  ARROW_ASSIGN_OR_RAISE(auto scanner_builder, dataset->NewScan());
+
+  ARROW_RETURN_NOT_OK(scanner_builder->UseThreads(true));
+
+  return scanner_builder->Finish();
+}
+
+TEST(GH41813, GH41813) {
+  std::string uri =
+      "file:///Users/zanmato/Downloads/arrow_segfault_reproducer_2/data/reduced_attempt3";
+  std::string path;
+  auto format = std::make_shared<ds::ParquetFileFormat>();
+  ASSERT_OK_AND_ASSIGN(auto fs, GetFileSystemFromUri(uri, &path));
+  ASSERT_OK_AND_ASSIGN(auto dataset, GetDatasetFromDirectory(fs, format, path));
+
+  ASSERT_OK_AND_ASSIGN(auto scanner, GetScannerFromDataset(dataset));
+
+  ASSERT_OK_AND_ASSIGN(auto table, scanner->ToTable());
+  std::cout << "Table size: " << table->num_rows() << "\n";
+}
+
 }  // namespace dataset
 }  // namespace arrow