From 9986b7b10618e0da2d8533e7e440c70f3fec7337 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 12 Sep 2024 10:42:22 +0200 Subject: [PATCH] GH-44072: [C++][Parquet] Add Float16 reading benchmarks (#44073) Local benchmark numbers: ``` --------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... --------------------------------------------------------------------------------------------------------------------------- BM_ReadColumnPlain/null_probability:-1 20038480 ns 20019703 ns 36 bytes_per_second=1.9512Gi/s items_per_second=523.772M/s BM_ReadColumnPlain/null_probability:0 37114403 ns 36766588 ns 19 bytes_per_second=1.06245Gi/s items_per_second=285.198M/s BM_ReadColumnPlain/null_probability:1 44589582 ns 44371707 ns 16 bytes_per_second=901.475Mi/s items_per_second=236.316M/s BM_ReadColumnPlain/null_probability:50 65624754 ns 65322683 ns 11 bytes_per_second=612.345Mi/s items_per_second=160.522M/s BM_ReadColumnPlain/null_probability:99 43072631 ns 42932582 ns 16 bytes_per_second=931.693Mi/s items_per_second=244.238M/s BM_ReadColumnPlain/null_probability:100 36710045 ns 36475141 ns 19 bytes_per_second=1.07093Gi/s items_per_second=287.477M/s BM_ReadColumnPlain/null_probability:-1 52718868 ns 52616204 ns 12 bytes_per_second=380.111Mi/s items_per_second=199.288M/s BM_ReadColumnPlain/null_probability:0 71273144 ns 71093105 ns 10 bytes_per_second=281.321Mi/s items_per_second=147.493M/s BM_ReadColumnPlain/null_probability:1 80674727 ns 80358048 ns 8 bytes_per_second=248.886Mi/s items_per_second=130.488M/s BM_ReadColumnPlain/null_probability:50 138249159 ns 137922632 ns 5 bytes_per_second=145.009Mi/s items_per_second=76.0264M/s BM_ReadColumnPlain/null_probability:99 86938382 ns 86576176 ns 8 bytes_per_second=231.01Mi/s items_per_second=121.116M/s BM_ReadColumnPlain/null_probability:100 74154244 ns 73984356 ns 9 bytes_per_second=270.327Mi/s 
items_per_second=141.729M/s ``` * GitHub Issue: #44072 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../parquet/arrow/reader_writer_benchmark.cc | 88 ++++++++++++++++--- 1 file changed, 76 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 95c4a659297d9..b12f234f72bdf 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -28,6 +28,7 @@ #include "parquet/file_reader.h" #include "parquet/file_writer.h" #include "parquet/platform.h" +#include "parquet/properties.h" #include "arrow/array.h" #include "arrow/array/builder_primitive.h" @@ -88,6 +89,11 @@ struct benchmark_traits { using arrow_type = ::arrow::BooleanType; }; +template <> +struct benchmark_traits { + using arrow_type = ::arrow::HalfFloatType; +}; + template using ArrowType = typename benchmark_traits::arrow_type; @@ -125,15 +131,15 @@ std::vector RandomVector(int64_t true_percentage, int64_t vector_size, return values; } -template +template > std::shared_ptr<::arrow::Table> TableFromVector( - const std::vector& vec, bool nullable, + const std::vector& vec, bool nullable, int64_t null_percentage = kAlternatingOrNa) { if (!nullable) { ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa); } - std::shared_ptr<::arrow::DataType> type = std::make_shared>(); - NumericBuilder> builder; + std::shared_ptr<::arrow::DataType> type = std::make_shared(); + NumericBuilder builder; if (nullable) { // Note true values select index 1 of sample_values auto valid_bytes = RandomVector(/*true_percentage=*/null_percentage, @@ -258,18 +264,20 @@ struct Examples { }; static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table, + std::shared_ptr properties, int64_t num_values = -1, int64_t total_bytes = -1) { auto output = CreateOutputStream(); - EXIT_NOT_OK( - WriteTable(table, ::arrow::default_memory_pool(), output, 
table.num_rows())); + EXIT_NOT_OK(WriteTable(table, ::arrow::default_memory_pool(), output, + /*chunk_size=*/table.num_rows(), properties)); PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish()); - while (state.KeepRunning()) { + for (auto _ : state) { auto reader = ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); std::unique_ptr arrow_reader; EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &arrow_reader)); + std::shared_ptr<::arrow::Table> table; EXIT_NOT_OK(arrow_reader->ReadTable(&table)); } @@ -283,8 +291,14 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& } } +static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table, + int64_t num_values = -1, int64_t total_bytes = -1) { + BenchmarkReadTable(state, table, default_writer_properties(), num_values, total_bytes); +} + static void BenchmarkReadArray(::benchmark::State& state, const std::shared_ptr& array, bool nullable, + std::shared_ptr properties, int64_t num_values = -1, int64_t total_bytes = -1) { auto schema = ::arrow::schema({field("s", array->type(), nullable)}); auto table = ::arrow::Table::Make(schema, {array}, array->length()); @@ -294,8 +308,15 @@ static void BenchmarkReadArray(::benchmark::State& state, - BenchmarkReadTable(state, *table, num_values, total_bytes); + BenchmarkReadTable(state, *table, properties, num_values, total_bytes); } +static void BenchmarkReadArray(::benchmark::State& state, + const std::shared_ptr& array, bool nullable, + int64_t num_values = -1, int64_t total_bytes = -1) { + BenchmarkReadArray(state, array, nullable, default_writer_properties(), num_values, + total_bytes); +} + // -// Benchmark reading a primitive column +// Benchmark reading a dict-encoded primitive column // template @@ -308,7 +329,9 @@ static void BM_ReadColumn(::benchmark::State& state) { std::shared_ptr<::arrow::Table> table = TableFromVector(values, nullable, state.range(0)); - 
BenchmarkReadTable(state, *table, table->num_rows(), + auto properties = 
WriterProperties::Builder().enable_dictionary()->build(); + + BenchmarkReadTable(state, *table, properties, table->num_rows(), sizeof(typename ParquetType::c_type) * table->num_rows()); } @@ -316,8 +339,9 @@ static void BM_ReadColumn(::benchmark::State& state) { // null_percentage governs distribution and therefore runs of null values. // first_value_percentage governs distribution of values (we select from 1 of 2) // so when 0 or 100 RLE is triggered all the time. When a value in the range (0, 100) -// there will be some percentage of RLE encoded values and some percentage of literal -// encoded values (RLE is much less likely with percentages close to 50). +// there will be some percentage of RLE-encoded dictionary indices and some +// percentage of literal-encoded dictionary indices +// (RLE is much less likely with percentages close to 50). BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type) ->Args({/*null_percentage=*/kAlternatingOrNa, 1}) ->Args({/*null_percentage=*/kAlternatingOrNa, 10}) @@ -325,6 +349,7 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type) BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type) ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0}) + ->Args({/*null_percentage=*/0, /*first_value_percentage=*/1}) ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1}) ->Args({/*null_percentage=*/10, /*first_value_percentage=*/10}) ->Args({/*null_percentage=*/25, /*first_value_percentage=*/5}) @@ -369,6 +394,45 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType) ->Args({kAlternatingOrNa, 1}) ->Args({5, 10}); +// +// Benchmark reading a PLAIN-encoded primitive column +// + +template +static void BM_ReadColumnPlain(::benchmark::State& state) { + using c_type = typename ArrowType::c_type; + + const std::vector values(BENCHMARK_SIZE, static_cast(42)); + std::shared_ptr<::arrow::Table> table = + TableFromVector(values, /*nullable=*/nullable, state.range(0)); + + auto properties = 
WriterProperties::Builder().disable_dictionary()->build(); + BenchmarkReadTable(state, *table, properties, table->num_rows(), + sizeof(c_type) * table->num_rows()); +} + +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Int32Type) + ->ArgNames({"null_probability"}) + ->Args({kAlternatingOrNa}); +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Int32Type) + ->ArgNames({"null_probability"}) + ->Args({0}) + ->Args({1}) + ->Args({50}) + ->Args({99}) + ->Args({100}); + +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Float16LogicalType) + ->ArgNames({"null_probability"}) + ->Args({kAlternatingOrNa}); +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Float16LogicalType) + ->ArgNames({"null_probability"}) + ->Args({0}) + ->Args({1}) + ->Args({50}) + ->Args({99}) + ->Args({100}); + // // Benchmark reading binary column //