GH-44072: [C++][Parquet] Add Float16 reading benchmarks #44073

Merged 1 commit on Sep 12, 2024

Changes from all commits
86 changes: 75 additions & 11 deletions cpp/src/parquet/arrow/reader_writer_benchmark.cc
@@ -28,6 +28,7 @@
#include "parquet/file_reader.h"
#include "parquet/file_writer.h"
#include "parquet/platform.h"
#include "parquet/properties.h"

#include "arrow/array.h"
#include "arrow/array/builder_primitive.h"
@@ -88,6 +89,11 @@ struct benchmark_traits<BooleanType> {
using arrow_type = ::arrow::BooleanType;
};

template <>
struct benchmark_traits<Float16LogicalType> {
using arrow_type = ::arrow::HalfFloatType;
};

template <typename ParquetType>
using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;

@@ -125,15 +131,15 @@ std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
return values;
}

template <typename ParquetType>
template <typename ParquetType, typename ArrowType = ArrowType<ParquetType>>
std::shared_ptr<::arrow::Table> TableFromVector(
const std::vector<typename ParquetType::c_type>& vec, bool nullable,
const std::vector<typename ArrowType::c_type>& vec, bool nullable,
int64_t null_percentage = kAlternatingOrNa) {
if (!nullable) {
ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa);
}
std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
NumericBuilder<ArrowType<ParquetType>> builder;
std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType>();
NumericBuilder<ArrowType> builder;
if (nullable) {
// Note true values select index 1 of sample_values
auto valid_bytes = RandomVector<uint8_t>(/*true_percentage=*/null_percentage,
@@ -258,18 +264,20 @@ struct Examples<bool> {
};

static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
std::shared_ptr<WriterProperties> properties,
int64_t num_values = -1, int64_t total_bytes = -1) {
auto output = CreateOutputStream();
EXIT_NOT_OK(
WriteTable(table, ::arrow::default_memory_pool(), output, table.num_rows()));
EXIT_NOT_OK(WriteTable(table, ::arrow::default_memory_pool(), output,
/*chunk_size=*/table.num_rows(), properties));
PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());

while (state.KeepRunning()) {
for (auto _ : state) {
auto reader =
ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
std::unique_ptr<FileReader> arrow_reader;
EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
&arrow_reader));

std::shared_ptr<::arrow::Table> table;
EXIT_NOT_OK(arrow_reader->ReadTable(&table));
}
@@ -283,8 +291,14 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table&
}
}

static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
int64_t num_values = -1, int64_t total_bytes = -1) {
BenchmarkReadTable(state, table, default_writer_properties(), num_values, total_bytes);
}

static void BenchmarkReadArray(::benchmark::State& state,
const std::shared_ptr<Array>& array, bool nullable,
std::shared_ptr<WriterProperties> properties,
int64_t num_values = -1, int64_t total_bytes = -1) {
auto schema = ::arrow::schema({field("s", array->type(), nullable)});
auto table = ::arrow::Table::Make(schema, {array}, array->length());
@@ -294,8 +308,15 @@ static void BenchmarkReadArray(::benchmark::State& state,
BenchmarkReadTable(state, *table, num_values, total_bytes);
}

static void BenchmarkReadArray(::benchmark::State& state,
const std::shared_ptr<Array>& array, bool nullable,
int64_t num_values = -1, int64_t total_bytes = -1) {
BenchmarkReadArray(state, array, nullable, default_writer_properties(), num_values,
total_bytes);
}

//
// Benchmark reading a primitive column
// Benchmark reading a dict-encoded primitive column
//

template <bool nullable, typename ParquetType>
@@ -308,23 +329,27 @@ static void BM_ReadColumn(::benchmark::State& state) {
std::shared_ptr<::arrow::Table> table =
TableFromVector<ParquetType>(values, nullable, state.range(0));

BenchmarkReadTable(state, *table, table->num_rows(),
auto properties = WriterProperties::Builder().disable_dictionary()->build();

BenchmarkReadTable(state, *table, properties, table->num_rows(),
sizeof(typename ParquetType::c_type) * table->num_rows());
}

// There are two parameters here that cover different data distributions.
// null_percentage governs distribution and therefore runs of null values.
// first_value_percentage governs distribution of values (we select from 1 of 2)
// so when 0 or 100 RLE is triggered all the time. When a value in the range (0, 100)
// there will be some percentage of RLE encoded values and some percentage of literal
// encoded values (RLE is much less likely with percentages close to 50).
// there will be some percentage of RLE-encoded dictionary indices and some
// percentage of literal encoded dictionary indices
// (RLE is much less likely with percentages close to 50).
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
->Args({/*null_percentage=*/kAlternatingOrNa, 1})
->Args({/*null_percentage=*/kAlternatingOrNa, 10})
->Args({/*null_percentage=*/kAlternatingOrNa, 50});

BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
->Args({/*null_percentage=*/0, /*first_value_percentage=*/1})
->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
->Args({/*null_percentage=*/10, /*first_value_percentage=*/10})
->Args({/*null_percentage=*/25, /*first_value_percentage=*/5})
@@ -369,6 +394,45 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});

//
// Benchmark reading a PLAIN-encoded primitive column
//

template <bool nullable, typename ParquetType>
static void BM_ReadColumnPlain(::benchmark::State& state) {
using c_type = typename ArrowType<ParquetType>::c_type;

const std::vector<c_type> values(BENCHMARK_SIZE, static_cast<c_type>(42));
std::shared_ptr<::arrow::Table> table =
TableFromVector<ParquetType>(values, /*nullable=*/nullable, state.range(0));

auto properties = WriterProperties::Builder().disable_dictionary()->build();
BenchmarkReadTable(state, *table, properties, table->num_rows(),
sizeof(c_type) * table->num_rows());
}

BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Int32Type)
Member commented:

I'm a bit curious why Int32 is benchmarked here. Is it to compare with Float32?

Member (Author) replied:

Yes, and because the other benchmarks are dict-encoded.

->ArgNames({"null_probability"})
->Args({kAlternatingOrNa});
BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Int32Type)
->ArgNames({"null_probability"})
->Args({0})
->Args({1})
->Args({50})
->Args({99})
->Args({100});

BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Float16LogicalType)
->ArgNames({"null_probability"})
->Args({kAlternatingOrNa});
BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Float16LogicalType)
->ArgNames({"null_probability"})
->Args({0})
->Args({1})
->Args({50})
->Args({99})
->Args({100});

//
// Benchmark reading binary column
//
(remainder of the file diff not shown)
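
As a standalone illustration of the pattern the new `BM_ReadColumnPlain` benchmarks exercise, here is a minimal sketch (not part of this PR): it builds a float16 column, writes it with dictionary encoding disabled so the data pages are PLAIN-encoded, and reads it back through `parquet::arrow::FileReader`, mirroring `BenchmarkReadTable` above. The function name `RoundTripFloat16` and the fixed value count are illustrative only.

```cpp
#include <cstdint>
#include <memory>

#include "arrow/api.h"
#include "arrow/io/memory.h"
#include "parquet/arrow/reader.h"
#include "parquet/arrow/writer.h"
#include "parquet/file_reader.h"
#include "parquet/properties.h"

// Hypothetical sketch: write a PLAIN-encoded float16 column and read it back.
arrow::Status RoundTripFloat16() {
  // Build a float16 column; HalfFloatType::c_type is uint16_t (raw IEEE half bits).
  arrow::HalfFloatBuilder builder;
  for (int i = 0; i < 1024; ++i) {
    ARROW_RETURN_NOT_OK(builder.Append(static_cast<uint16_t>(0x3C00)));  // 1.0
  }
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> array, builder.Finish());
  auto schema = arrow::schema({arrow::field("f16", arrow::float16())});
  auto table = arrow::Table::Make(schema, {array});

  // Disable dictionary encoding so the column is PLAIN-encoded, as in BM_ReadColumnPlain.
  auto properties = parquet::WriterProperties::Builder().disable_dictionary()->build();

  // Write the table to an in-memory Parquet file.
  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::BufferOutputStream::Create());
  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(),
                                                 sink,
                                                 /*chunk_size=*/table->num_rows(),
                                                 properties));
  ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish());

  // Read it back through parquet::arrow::FileReader, mirroring the benchmark loop.
  auto reader = parquet::ParquetFileReader::Open(
      std::make_shared<arrow::io::BufferReader>(buffer));
  std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::FileReader::Make(
      arrow::default_memory_pool(), std::move(reader), &arrow_reader));
  std::shared_ptr<arrow::Table> out;
  return arrow_reader->ReadTable(&out);
}
```

With `disable_dictionary()` the column chunk's data pages use PLAIN encoding, whereas the dict-encoded read benchmarks above decode RLE-encoded dictionary indices instead; that distinction is what the review exchange on the Int32 `BM_ReadColumnPlain` registration refers to.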