Skip to content

Commit

Permalink
Add integration test
Browse files Browse the repository at this point in the history
  • Loading branch information
pitrou committed Mar 18, 2024
1 parent b8bd382 commit 94626c4
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 1 deletion.
71 changes: 71 additions & 0 deletions cpp/src/parquet/reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <iostream>
#include <memory>
#include <string>
#include <utility>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
Expand Down Expand Up @@ -123,6 +124,10 @@ std::string concatenated_gzip_members() {

std::string byte_stream_split() { return data_file("byte_stream_split.zstd.parquet"); }

// Resolves the gzip-compressed "extended" BYTE_STREAM_SPLIT sample file through
// data_file() and returns the resulting path.
std::string byte_stream_split_extended() {
  std::string path = data_file("byte_stream_split_extended.gzip.parquet");
  return path;
}

template <typename DType, typename ValueType = typename DType::c_type>
std::vector<ValueType> ReadColumnValues(ParquetFileReader* file_reader, int row_group,
int column, int64_t expected_values_read) {
Expand Down Expand Up @@ -154,6 +159,44 @@ void AssertColumnValues(std::shared_ptr<TypedColumnReader<DType>> col, int64_t b
ASSERT_EQ(expected_values_read, values_read);
}

template <typename DType, typename ValueType = typename DType::c_type>
void AssertColumnValuesEqual(std::shared_ptr<TypedColumnReader<DType>> left_col,
std::shared_ptr<TypedColumnReader<DType>> right_col,
int64_t batch_size, int64_t expected_levels_read,
int64_t expected_values_read) {
std::vector<ValueType> left_values(batch_size);
std::vector<ValueType> right_values(batch_size);
int64_t values_read, levels_read;

levels_read =
left_col->ReadBatch(batch_size, nullptr, nullptr, left_values.data(), &values_read);
ASSERT_EQ(expected_levels_read, levels_read);
ASSERT_EQ(expected_values_read, values_read);

levels_read = right_col->ReadBatch(batch_size, nullptr, nullptr, right_values.data(),
&values_read);
ASSERT_EQ(expected_levels_read, levels_read);
ASSERT_EQ(expected_values_read, values_read);

ASSERT_EQ(left_values, right_values);
}

// Looks up two columns by name in the file's schema, opens a typed reader for
// each within row group `row_group`, and asserts that reading `num_rows` values
// from both yields identical results (levels read, values read and values are
// all expected to equal `num_rows`).
//
// A scoped trace naming both columns is attached to any failure raised below.
// The test fails immediately, with the offending name, if either column cannot
// be found, instead of passing a negative index on to RowGroupReader::Column().
template <typename DType, typename ValueType = typename DType::c_type>
void AssertColumnValuesEqual(ParquetFileReader* file_reader, const std::string& left_col,
                             const std::string& right_col, int64_t num_rows,
                             int64_t row_group = 0) {
  ARROW_SCOPED_TRACE("left_col = '", left_col, "', right_col = '", right_col, "'");

  auto metadata = file_reader->metadata();

  auto left_col_index = metadata->schema()->ColumnIndex(left_col);
  ASSERT_GE(left_col_index, 0) << "column '" << left_col << "' not found in schema";
  auto right_col_index = metadata->schema()->ColumnIndex(right_col);
  ASSERT_GE(right_col_index, 0) << "column '" << right_col << "' not found in schema";

  auto row_group_reader = file_reader->RowGroup(row_group);
  auto left_reader = checked_pointer_cast<TypedColumnReader<DType>>(
      row_group_reader->Column(left_col_index));
  auto right_reader = checked_pointer_cast<TypedColumnReader<DType>>(
      row_group_reader->Column(right_col_index));
  AssertColumnValuesEqual(left_reader, right_reader, num_rows, num_rows, num_rows);
}

void CheckRowGroupMetadata(const RowGroupMetaData* rg_metadata,
bool allow_uncompressed_mismatch = false) {
const int64_t total_byte_size = rg_metadata->total_byte_size();
Expand Down Expand Up @@ -1522,6 +1565,34 @@ TEST(TestByteStreamSplit, FloatIntegrationFile) {
}
#endif // ARROW_WITH_ZSTD

#ifdef ARROW_WITH_ZLIB
// Integration test over the "extended" BYTE_STREAM_SPLIT sample file: for each
// covered type, the PLAIN-named column and its BYTE_STREAM_SPLIT-named
// counterpart must decode to the same values.
TEST(TestByteStreamSplit, ExtendedIntegrationFile) {
  constexpr int64_t kNumRows = 200;

  auto file = ParquetFileReader::OpenFile(byte_stream_split_extended());
  ParquetFileReader* reader = file.get();

  // Sanity-check the file layout before comparing columns.
  ASSERT_EQ(kNumRows, file->metadata()->num_rows());
  ASSERT_EQ(14, file->metadata()->num_columns());
  ASSERT_EQ(1, file->metadata()->num_row_groups());

  // Floating-point columns.
  AssertColumnValuesEqual<FloatType>(reader, "float_plain", "float_byte_stream_split",
                                     kNumRows);
  AssertColumnValuesEqual<DoubleType>(reader, "double_plain", "double_byte_stream_split",
                                      kNumRows);

  // Integer columns.
  AssertColumnValuesEqual<Int32Type>(reader, "int32_plain", "int32_byte_stream_split",
                                     kNumRows);
  AssertColumnValuesEqual<Int64Type>(reader, "int64_plain", "int64_byte_stream_split",
                                     kNumRows);

  // Columns read through FLBAType.
  AssertColumnValuesEqual<FLBAType>(reader, "float16_plain", "float16_byte_stream_split",
                                    kNumRows);
  AssertColumnValuesEqual<FLBAType>(reader, "flba5_plain", "flba5_byte_stream_split",
                                    kNumRows);
  AssertColumnValuesEqual<FLBAType>(reader, "decimal_plain", "decimal_byte_stream_split",
                                    kNumRows);
}
#endif // ARROW_WITH_ZLIB

struct PageIndexReaderParam {
std::vector<int32_t> row_group_indices;
std::vector<int32_t> column_indices;
Expand Down
2 changes: 1 addition & 1 deletion cpp/submodules/parquet-testing

0 comments on commit 94626c4

Please sign in to comment.