Skip to content

Commit

Permalink
GH-43427: [C++][Parquet] Deprecate ColumnChunk::file_offset field and…
Browse files Browse the repository at this point in the history
… no longer write Metadata at end of Chunk (#43428)

### Rationale for this change

See github issue

### What changes are included in this PR?

Force `ColumnChunk::file_offset` tobe 0

### Are these changes tested?

No

### Are there any user-facing changes?

Might affect user using legacy `ColumnChunk::file_offset`

* GitHub Issue: #43427

Authored-by: mwish <maplewish117@gmail.com>
Signed-off-by: mwish <maplewish117@gmail.com>
  • Loading branch information
mapleFU authored Aug 7, 2024
1 parent b852b72 commit 9b58454
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 11 deletions.
2 changes: 1 addition & 1 deletion c_glib/test/parquet/test-column-chunk-metadata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def setup

test("#file_offset") do
assert do
@metadata.file_offset > 0
@metadata.file_offset == 0
end
end

Expand Down
5 changes: 0 additions & 5 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,6 @@ class SerializedPageWriter : public PageWriter {
total_compressed_size_, total_uncompressed_size_, has_dictionary,
fallback, dict_encoding_stats_, data_encoding_stats_,
meta_encryptor_);
// Write metadata at end of column chunk
metadata_->WriteTo(sink_.get());
}

/**
Expand Down Expand Up @@ -667,9 +665,6 @@ class BufferedPageWriter : public PageWriter {
has_dictionary, fallback, pager_->dict_encoding_stats_,
pager_->data_encoding_stats_, pager_->meta_encryptor_);

// Write metadata at end of column chunk
metadata_->WriteTo(in_memory_sink_.get());

// Buffered page writer needs to adjust page offsets.
pager_->FinishPageIndexes(final_position);

Expand Down
6 changes: 3 additions & 3 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1536,10 +1536,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
const std::shared_ptr<Encryptor>& encryptor) {
if (dictionary_page_offset > 0) {
column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
} else {
column_chunk_->__set_file_offset(data_page_offset + compressed_size);
}
// The `file_offset` field is deprecated and should be set to 0.
// See https://github.com/apache/parquet-format/pull/440 for detail.
column_chunk_->__set_file_offset(0);
column_chunk_->__isset.meta_data = true;
column_chunk_->meta_data.__set_num_values(num_values);
if (index_page_offset >= 0) {
Expand Down
7 changes: 6 additions & 1 deletion cpp/src/parquet/metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,12 @@ class PARQUET_EXPORT ColumnChunkMetaData {

bool Equals(const ColumnChunkMetaData& other) const;

// column chunk
// Byte offset of `ColumnMetaData` in `file_path()`.
//
// Note that the meaning of this field has been inconsistent among implementations
// so its use has since been deprecated in the Parquet specification. Modern
// implementations will set this to `0` to indicate that the `ColumnMetaData` is solely
// contained in the `ColumnChunk` struct.
int64_t file_offset() const;

// parameter is only used when a dataset is spread across multiple files
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def test_parquet_metadata_api():
col_meta = rg_meta.column(ncols + 2)

col_meta = rg_meta.column(0)
assert col_meta.file_offset > 0
assert col_meta.file_offset == 0
assert col_meta.file_path == '' # created from BytesIO
assert col_meta.physical_type == 'BOOLEAN'
assert col_meta.num_values == 10000
Expand Down

0 comments on commit 9b58454

Please sign in to comment.