diff --git a/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs b/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs index 3d79f72654e8..30a606d6108a 100644 --- a/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs +++ b/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs @@ -21,9 +21,12 @@ mod serde_types { use serde_types::*; /// Metadata for a column chunk. -// This contains the `ColumnDescriptor` associated with the chunk so that deserializers have -// access to the descriptor (e.g. physical, converted, logical). -#[derive(Debug, Clone)] +/// +/// This contains the `ColumnDescriptor` associated with the chunk so that deserializers have +/// access to the descriptor (e.g. physical, converted, logical). +/// +/// This struct is intentionally not `Clone`, as it is a huge struct. +#[derive(Debug)] #[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] pub struct ColumnChunkMetadata { #[cfg_attr( diff --git a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs index 7ae449c64d90..a7ffd6f7ba6d 100644 --- a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs +++ b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs @@ -10,7 +10,7 @@ pub use crate::parquet::thrift_format::KeyValue; /// Metadata for a Parquet file. // This is almost equal to [`parquet_format_safe::FileMetaData`] but contains the descriptors, // which are crucial to deserialize pages. -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct FileMetaData { /// version of this file. pub version: i32, @@ -87,25 +87,6 @@ impl FileMetaData { column_orders, }) } - - /// Serializes itself to thrift's [`parquet_format_safe::FileMetaData`]. - pub fn into_thrift(self) -> parquet_format_safe::FileMetaData { - parquet_format_safe::FileMetaData { - version: self.version, - schema: self.schema_descr.into_thrift(), - num_rows: self.num_rows as i64, - row_groups: self - .row_groups - .into_iter() - .map(|v| v.into_thrift()) - .collect(), - key_value_metadata: self.key_value_metadata, - created_by: self.created_by, - column_orders: None, // todo - encryption_algorithm: None, - footer_signing_key_metadata: None, - } - } } /// Parses [`ColumnOrder`] from Thrift definition. diff --git a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs index 08b1f8129e61..717bc7e243d8 100644 --- a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs +++ b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use hashbrown::hash_map::RawEntryMut; use parquet_format_safe::RowGroup; use polars_utils::aliases::{InitHashMaps, PlHashMap}; @@ -8,7 +10,6 @@ use polars_utils::unitvec; use super::column_chunk_metadata::{column_metadata_byte_range, ColumnChunkMetadata}; use super::schema_descriptor::SchemaDescriptor; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::write::ColumnOffsetsMetadata; type ColumnLookup = PlHashMap>; @@ -34,9 +35,8 @@ impl InitColumnLookup for ColumnLookup { /// Metadata for a row group. #[derive(Debug, Clone, Default)] -// #[cfg_attr(feature = "serde_types", derive(serde::Deserialize, serde::Serialize))] pub struct RowGroupMetaData { - columns: Vec, + columns: Arc<[ColumnChunkMetadata]>, column_lookup: PlHashMap>, num_rows: usize, total_byte_size: usize, @@ -125,7 +125,7 @@ impl RowGroupMetaData { Ok(column) }) - .collect::>>()?; + .collect::>>()?; Ok(RowGroupMetaData { columns, @@ -135,26 +135,4 @@ impl RowGroupMetaData { full_byte_range, }) } - - /// Method to convert to Thrift. - pub(crate) fn into_thrift(self) -> RowGroup { - let file_offset = self - .columns - .iter() - .map(|c| { - ColumnOffsetsMetadata::from_column_chunk_metadata(c).calc_row_group_file_offset() - }) - .next() - .unwrap_or(None); - let total_compressed_size = Some(self.compressed_size() as i64); - RowGroup { - columns: self.columns.into_iter().map(|v| v.into_thrift()).collect(), - total_byte_size: self.total_byte_size as i64, - num_rows: self.num_rows as i64, - sorting_columns: None, - file_offset, - total_compressed_size, - ordinal: None, - } - } } diff --git a/crates/polars-parquet/src/parquet/read/mod.rs b/crates/polars-parquet/src/parquet/read/mod.rs index 4be985b74f3b..c3ec112e6864 100644 --- a/crates/polars-parquet/src/parquet/read/mod.rs +++ b/crates/polars-parquet/src/parquet/read/mod.rs @@ -19,24 +19,7 @@ use polars_utils::mmap::MemReader; pub use stream::read_metadata as read_metadata_async; use crate::parquet::error::ParquetResult; -use crate::parquet::metadata::{ColumnChunkMetadata, FileMetaData, RowGroupMetaData}; - -/// Filters row group metadata to only those row groups, -/// for which the predicate function returns true -pub fn filter_row_groups( - metadata: &FileMetaData, - predicate: &dyn Fn(&RowGroupMetaData, usize) -> bool, -) -> FileMetaData { - let mut filtered_row_groups = Vec::::new(); - for (i, row_group_metadata) in metadata.row_groups.iter().enumerate() { - if predicate(row_group_metadata, i) { - filtered_row_groups.push(row_group_metadata.clone()); - } - } - let mut metadata = metadata.clone(); - metadata.row_groups = filtered_row_groups; - metadata -} +use crate::parquet::metadata::ColumnChunkMetadata; /// Returns a new [`PageReader`] by seeking `reader` to the beginning of `column_chunk`. pub fn get_page_iterator(