diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index d8e09f04ba83..ad694ec60399 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -128,8 +128,4 @@ jobs:
         rustup component add clippy
       - name: Run clippy
         run: |
-          # Only run clippy for the library at this time,
-          # as there are clippy errors for other targets
-          cargo clippy -p parquet --all-features --lib -- -D warnings
-          # https://github.com/apache/arrow-rs/issues/1254
-          #cargo clippy -p parquet --all-targets --all-features -- -D warnings
+          cargo clippy -p parquet --all-targets --all-features -- -D warnings
diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs
index d9c1bedb246c..32ffaeb9d5cc 100644
--- a/parquet/src/arrow/array_reader/builder.rs
+++ b/parquet/src/arrow/array_reader/builder.rs
@@ -104,13 +104,11 @@ fn build_list_reader(
     let data_type = field.arrow_type.clone();
     let item_reader = build_reader(&children[0], row_groups)?;
-    let item_type = item_reader.get_data_type().clone();
 
     match is_large {
         false => Ok(Box::new(ListArrayReader::<i32>::new(
             item_reader,
             data_type,
-            item_type,
             field.def_level,
             field.rep_level,
             field.nullable,
@@ -118,7 +116,6 @@
         true => Ok(Box::new(ListArrayReader::<i64>::new(
             item_reader,
             data_type,
-            item_type,
             field.def_level,
             field.rep_level,
             field.nullable,
@@ -318,7 +315,7 @@ mod tests {
     use super::*;
     use crate::arrow::parquet_to_arrow_schema;
     use crate::file::reader::{FileReader, SerializedFileReader};
-    use crate::util::test_common::get_test_file;
+    use crate::util::test_common::file_util::get_test_file;
     use arrow::datatypes::Field;
     use std::sync::Arc;
diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs
index 94770794cb73..2acd59dcc24f 100644
--- a/parquet/src/arrow/array_reader/list_array.rs
+++ b/parquet/src/arrow/array_reader/list_array.rs
@@ -34,7 +34,6 @@ use std::sync::Arc;
 pub struct ListArrayReader<OffsetSize: OffsetSizeTrait> {
     item_reader: Box<dyn ArrayReader>,
     data_type: ArrowType,
-    item_type: ArrowType,
     /// The definition level at which this list is not null
     def_level: i16,
     /// The repetition level that corresponds to a new value in this array
@@ -49,7 +48,6 @@ impl<OffsetSize: OffsetSizeTrait> ListArrayReader<OffsetSize> {
     pub fn new(
         item_reader: Box<dyn ArrayReader>,
         data_type: ArrowType,
-        item_type: ArrowType,
         def_level: i16,
         rep_level: i16,
         nullable: bool,
@@ -57,7 +55,6 @@
         Self {
             item_reader,
             data_type,
-            item_type,
             def_level,
             rep_level,
             nullable,
@@ -304,13 +301,13 @@ mod tests {
     // ]
     let l3_item_type = ArrowType::Int32;
-    let l3_type = list_type::<OffsetSize>(l3_item_type.clone(), true);
+    let l3_type = list_type::<OffsetSize>(l3_item_type, true);
 
     let l2_item_type = l3_type.clone();
-    let l2_type = list_type::<OffsetSize>(l2_item_type.clone(), true);
+    let l2_type = list_type::<OffsetSize>(l2_item_type, true);
 
     let l1_item_type = l2_type.clone();
-    let l1_type = list_type::<OffsetSize>(l1_item_type.clone(), false);
+    let l1_type = list_type::<OffsetSize>(l1_item_type, false);
 
     let leaf = PrimitiveArray::<Int32Type>::from_iter(vec![
         Some(1),
@@ -387,7 +384,6 @@
     let l3 = ListArrayReader::<OffsetSize>::new(
         Box::new(item_array_reader),
         l3_type,
-        l3_item_type,
         5,
         3,
         true,
@@ -396,7 +392,6 @@
     let l2 = ListArrayReader::<OffsetSize>::new(
         Box::new(l3),
         l2_type,
-        l2_item_type,
         3,
         2,
         false,
@@ -405,7 +400,6 @@
     let mut l1 = ListArrayReader::<OffsetSize>::new(
         Box::new(l2),
         l1_type,
-        l1_item_type,
         2,
         1,
         true,
@@ -456,7 +450,6 @@
     let mut list_array_reader = ListArrayReader::<OffsetSize>::new(
         Box::new(item_array_reader),
         list_type::<OffsetSize>(ArrowType::Int32, true),
-        ArrowType::Int32,
         1,
         1,
         false,
@@ -509,7 +502,6 @@
     let mut list_array_reader = ListArrayReader::<OffsetSize>::new(
         Box::new(item_array_reader),
         list_type::<OffsetSize>(ArrowType::Int32, true),
-        ArrowType::Int32,
         2,
         1,
         true,
diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs
index 83ba63ca1706..3ba7f6960ec3 100644
--- a/parquet/src/arrow/array_reader/map_array.rs
+++ b/parquet/src/arrow/array_reader/map_array.rs
@@ -32,6 +32,7 @@ pub struct MapArrayReader {
     value_reader: Box<dyn ArrayReader>,
     data_type: ArrowType,
     map_def_level: i16,
+    #[allow(unused)]
     map_rep_level: i16,
 }
 
@@ -47,6 +48,7 @@ impl MapArrayReader {
             key_reader,
             value_reader,
             data_type,
+            // These are the wrong way round https://github.com/apache/arrow-rs/issues/1699
             map_def_level: rep_level,
             map_rep_level: def_level,
         }
diff --git a/parquet/src/arrow/array_reader/null_array.rs b/parquet/src/arrow/array_reader/null_array.rs
index 682d15f8a177..405633f0a823 100644
--- a/parquet/src/arrow/array_reader/null_array.rs
+++ b/parquet/src/arrow/array_reader/null_array.rs
@@ -39,7 +39,6 @@ where
     pages: Box<dyn PageIterator>,
     def_levels_buffer: Option<Buffer>,
     rep_levels_buffer: Option<Buffer>,
-    column_desc: ColumnDescPtr,
     record_reader: RecordReader<T>,
 }
 
@@ -50,14 +49,13 @@ where
 {
     /// Construct null array reader.
     pub fn new(pages: Box<dyn PageIterator>, column_desc: ColumnDescPtr) -> Result<Self> {
-        let record_reader = RecordReader::<T>::new(column_desc.clone());
+        let record_reader = RecordReader::<T>::new(column_desc);
 
         Ok(Self {
             data_type: ArrowType::Null,
             pages,
             def_levels_buffer: None,
             rep_levels_buffer: None,
-            column_desc,
             record_reader,
         })
     }
diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs
index 45614d50941c..516a3f50c712 100644
--- a/parquet/src/arrow/array_reader/primitive_array.rs
+++ b/parquet/src/arrow/array_reader/primitive_array.rs
@@ -44,7 +44,6 @@ where
     pages: Box<dyn PageIterator>,
     def_levels_buffer: Option<Buffer>,
     rep_levels_buffer: Option<Buffer>,
-    column_desc: ColumnDescPtr,
     record_reader: RecordReader<T>,
 }
 
@@ -67,14 +66,13 @@ where
                 .clone(),
         };
 
-        let record_reader = RecordReader::<T>::new(column_desc.clone());
+        let record_reader = RecordReader::<T>::new(column_desc);
 
         Ok(Self {
             data_type,
             pages,
             def_levels_buffer: None,
             rep_levels_buffer: None,
-            column_desc,
             record_reader,
         })
     }
@@ -244,7 +242,7 @@ mod tests {
     use crate::data_type::Int32Type;
     use crate::schema::parser::parse_message_type;
     use crate::schema::types::SchemaDescriptor;
-    use crate::util::test_common::make_pages;
+    use crate::util::test_common::rand_gen::make_pages;
     use crate::util::InMemoryPageIterator;
     use arrow::array::PrimitiveArray;
     use arrow::datatypes::ArrowPrimitiveType;
@@ -252,6 +250,7 @@
     use rand::distributions::uniform::SampleUniform;
     use std::collections::VecDeque;
 
+    #[allow(clippy::too_many_arguments)]
     fn make_column_chunks<T: DataType>(
         column_desc: ColumnDescPtr,
         encoding: Encoding,
diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs
index b333c66cb213..f682f146c721 100644
--- a/parquet/src/arrow/array_reader/struct_array.rs
+++ b/parquet/src/arrow/array_reader/struct_array.rs
@@ -314,7 +314,6 @@ mod tests {
         let list_reader = ListArrayReader::<i32>::new(
             Box::new(reader),
             expected_l.data_type().clone(),
-            ArrowType::Int32,
             3,
             1,
             true,
diff --git a/parquet/src/arrow/array_reader/test_util.rs b/parquet/src/arrow/array_reader/test_util.rs
index da9b8d3bf9b2..ca1aabfd4aa1 100644
--- a/parquet/src/arrow/array_reader/test_util.rs
+++ b/parquet/src/arrow/array_reader/test_util.rs
@@ -48,8 +48,7 @@ pub fn utf8_column() -> ColumnDescPtr {
 
 /// Encode `data` with the provided `encoding`
 pub fn encode_byte_array(encoding: Encoding, data: &[ByteArray]) -> ByteBufferPtr {
-    let descriptor = utf8_column();
-    let mut encoder = get_encoder::<ByteArrayType>(descriptor, encoding).unwrap();
+    let mut encoder = get_encoder::<ByteArrayType>(encoding).unwrap();
 
     encoder.put(data).unwrap();
     encoder.flush_buffer().unwrap()
diff --git a/parquet/src/arrow/arrow_reader.rs b/parquet/src/arrow/arrow_reader.rs
index 594b416761bb..726c200606fc 100644
--- a/parquet/src/arrow/arrow_reader.rs
+++ b/parquet/src/arrow/arrow_reader.rs
@@ -85,6 +85,7 @@ pub(crate) struct RowSelection {
 
 impl RowSelection {
     /// Select `row_count` rows
+    #[allow(unused)]
     pub fn select(row_count: usize) -> Self {
         Self {
             row_count,
@@ -93,6 +94,7 @@
     }
 
     /// Skip `row_count` rows
+    #[allow(unused)]
     pub fn skip(row_count: usize) -> Self {
         Self {
             row_count,
@@ -109,7 +111,7 @@ pub struct ArrowReaderOptions {
 
 impl ArrowReaderOptions {
     /// Create a new [`ArrowReaderOptions`] with the default settings
-    fn new() -> Self {
+    pub fn new() -> Self {
         Self::default()
     }
 
@@ -129,6 +131,7 @@
     /// Scan rows from the parquet file according to the provided `selection`
     ///
     /// TODO: Make public once row selection fully implemented (#1792)
+    #[allow(unused)]
     pub(crate) fn with_row_selection(
         self,
         selection: impl Into<Vec<RowSelection>>,
@@ -433,7 +436,7 @@ mod tests {
     use crate::file::writer::SerializedFileWriter;
     use crate::schema::parser::parse_message_type;
     use crate::schema::types::{Type, TypePtr};
-    use crate::util::test_common::RandGen;
+    use crate::util::test_common::rand_gen::RandGen;
 
     #[test]
     fn test_arrow_reader_all_columns() {
diff --git a/parquet/src/arrow/buffer/converter.rs b/parquet/src/arrow/buffer/converter.rs
index 4cd0589424fc..d8cbd256a460 100644
--- a/parquet/src/arrow/buffer/converter.rs
+++ b/parquet/src/arrow/buffer/converter.rs
@@ -17,18 +17,21 @@
 
 use crate::data_type::{ByteArray, FixedLenByteArray, Int96};
 use arrow::array::{
-    Array, ArrayRef, BasicDecimalArray, BinaryArray, BinaryBuilder, Decimal128Array,
-    FixedSizeBinaryArray, FixedSizeBinaryBuilder, IntervalDayTimeArray,
-    IntervalDayTimeBuilder, IntervalYearMonthArray, IntervalYearMonthBuilder,
-    LargeBinaryArray, LargeBinaryBuilder, LargeStringArray, LargeStringBuilder,
-    StringArray, StringBuilder, TimestampNanosecondArray,
+    Array, ArrayRef, BasicDecimalArray, Decimal128Array, FixedSizeBinaryArray,
+    FixedSizeBinaryBuilder, IntervalDayTimeArray, IntervalDayTimeBuilder,
+    IntervalYearMonthArray, IntervalYearMonthBuilder, TimestampNanosecondArray,
 };
 
-use std::convert::{From, TryInto};
 use std::sync::Arc;
 
 use crate::errors::Result;
 use std::marker::PhantomData;
 
+#[cfg(test)]
+use arrow::array::{
+    BinaryArray, BinaryBuilder, LargeStringArray, LargeStringBuilder, StringArray,
+    StringBuilder,
+};
+
 /// A converter is used to consume record reader's content and convert it to arrow
 /// primitive array.
 pub trait Converter<S, T> {
@@ -185,8 +188,10 @@ impl Converter<Vec<Option<Int96>>, TimestampNanosecondArray> for Int96ArrayConve
     }
 }
 
+#[cfg(test)]
 pub struct Utf8ArrayConverter {}
 
+#[cfg(test)]
 impl Converter<Vec<Option<ByteArray>>, StringArray> for Utf8ArrayConverter {
     fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<StringArray> {
         let data_size = source
@@ -206,8 +211,10 @@
     }
 }
 
+#[cfg(test)]
 pub struct LargeUtf8ArrayConverter {}
 
+#[cfg(test)]
 impl Converter<Vec<Option<ByteArray>>, LargeStringArray> for LargeUtf8ArrayConverter {
     fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<LargeStringArray> {
         let data_size = source
@@ -227,8 +234,10 @@
     }
 }
 
+#[cfg(test)]
 pub struct BinaryArrayConverter {}
 
+#[cfg(test)]
 impl Converter<Vec<Option<ByteArray>>, BinaryArray> for BinaryArrayConverter {
     fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<BinaryArray> {
         let mut builder = BinaryBuilder::new(source.len());
@@ -243,33 +252,9 @@
     }
 }
 
-pub struct LargeBinaryArrayConverter {}
-
-impl Converter<Vec<Option<ByteArray>>, LargeBinaryArray> for LargeBinaryArrayConverter {
-    fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<LargeBinaryArray> {
-        let mut builder = LargeBinaryBuilder::new(source.len());
-        for v in source {
-            match v {
-                Some(array) => builder.append_value(array.data()),
-                None => builder.append_null(),
-            }
-        }
-
-        Ok(builder.finish())
-    }
-}
-
+#[cfg(test)]
 pub type Utf8Converter =
     ArrayRefConverter<Vec<Option<ByteArray>>, StringArray, Utf8ArrayConverter>;
-pub type LargeUtf8Converter =
-    ArrayRefConverter<Vec<Option<ByteArray>>, LargeStringArray, LargeUtf8ArrayConverter>;
-pub type BinaryConverter =
-    ArrayRefConverter<Vec<Option<ByteArray>>, BinaryArray, BinaryArrayConverter>;
-pub type LargeBinaryConverter = ArrayRefConverter<
-    Vec<Option<ByteArray>>,
-    LargeBinaryArray,
-    LargeBinaryArrayConverter,
->;
 
 pub type Int96Converter =
     ArrayRefConverter<Vec<Option<Int96>>, TimestampNanosecondArray, Int96ArrayConverter>;
@@ -299,11 +284,13 @@ pub type DecimalFixedLengthByteArrayConverter = ArrayRefConverter<
 pub type DecimalByteArrayConvert =
     ArrayRefConverter<Vec<Option<ByteArray>>, Decimal128Array, DecimalArrayConverter>;
 
+#[cfg(test)]
 pub struct FromConverter<S, T> {
     _source: PhantomData<S>,
     _dest: PhantomData<T>,
 }
 
+#[cfg(test)]
 impl<S, T> FromConverter<S, T>
 where
     T: From<S>,
@@ -316,6 +303,7 @@
     }
 }
 
+#[cfg(test)]
 impl<S, T> Converter<S, T> for FromConverter<S, T>
 where
     T: From<S>,
diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs
index b64b2946b91a..ae9e3590de3f 100644
--- a/parquet/src/arrow/buffer/dictionary_buffer.rs
+++ b/parquet/src/arrow/buffer/dictionary_buffer.rs
@@ -49,6 +49,7 @@ impl<K: ScalarValue, V: ScalarValue> Default for DictionaryBuffer<K, V> {
 
 impl<K: ScalarValue, V: ScalarValue> DictionaryBuffer<K, V> {
+    #[allow(unused)]
     pub fn len(&self) -> usize {
         match self {
             Self::Dict { keys, .. } => keys.len(),
diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs
index 60fe4cdc9c96..88d45f3d746a 100644
--- a/parquet/src/arrow/record_reader/mod.rs
+++ b/parquet/src/arrow/record_reader/mod.rs
@@ -214,6 +214,7 @@ where
     }
 
     /// Returns number of records stored in buffer.
+    #[allow(unused)]
     pub fn num_records(&self) -> usize {
         self.num_records
     }
@@ -273,11 +274,6 @@ where
             .map(|levels| levels.split_bitmask(self.num_values))
     }
 
-    /// Returns column reader.
-    pub(crate) fn column_reader(&self) -> Option<&ColumnReader> {
-        self.column_reader.as_ref()
-    }
-
     /// Try to read one batch of data.
     fn read_one_batch(&mut self, batch_size: usize) -> Result<usize> {
         let rep_levels = self
diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs
index 2cb47bc00e7e..01aefcd48e1d 100644
--- a/parquet/src/arrow/schema.rs
+++ b/parquet/src/arrow/schema.rs
@@ -73,7 +73,7 @@ pub fn parquet_to_arrow_schema_by_columns(
     // Add the Arrow metadata to the Parquet metadata skipping keys that collide
     if let Some(arrow_schema) = &maybe_schema {
         arrow_schema.metadata().iter().for_each(|(k, v)| {
-            metadata.entry(k.clone()).or_insert(v.clone());
+            metadata.entry(k.clone()).or_insert_with(|| v.clone());
         });
     }
 
@@ -100,7 +100,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result<Schema> {
         Ok(message) => message
             .header_as_schema()
             .map(arrow::ipc::convert::fb_to_schema)
-            .ok_or(arrow_err!("the message is not Arrow Schema")),
+            .ok_or_else(|| arrow_err!("the message is not Arrow Schema")),
         Err(err) => {
             // The flatbuffers implementation returns an error on verification error.
             Err(arrow_err!(
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 0cf1d5121b7e..9c58f764cb2c 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -18,7 +18,7 @@
 //! Contains Rust mappings for Thrift definition.
 //! Refer to `parquet.thrift` file to see raw definitions.
 
-use std::{convert, fmt, result, str};
+use std::{fmt, result, str};
 
 use parquet_format as parquet;
 
@@ -42,6 +42,7 @@ pub use parquet_format::{
 /// For example INT16 is not included as a type since a good encoding of INT32
 /// would handle this.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[allow(non_camel_case_types)]
 pub enum Type {
     BOOLEAN,
     INT32,
@@ -63,6 +64,7 @@
 /// This struct was renamed from `LogicalType` in version 4.0.0.
 /// If targeting Parquet format 2.4.0 or above, please use [LogicalType] instead.
 #[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
 pub enum ConvertedType {
     NONE,
     /// A BYTE_ARRAY actually contains UTF8 encoded chars.
@@ -197,6 +199,7 @@ pub enum LogicalType {
 
 /// Representation of field types in schema.
 #[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
 pub enum Repetition {
     /// Field is required (can not be null) and each record has exactly 1 value.
     REQUIRED,
@@ -213,6 +216,7 @@
 /// Not all encodings are valid for all types. These enums are also used to specify the
 /// encoding of definition and repetition levels.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)]
+#[allow(non_camel_case_types)]
 pub enum Encoding {
     /// Default byte encoding.
     /// - BOOLEAN - 1 bit per value, 0 is false; 1 is true.
@@ -294,6 +298,7 @@ pub enum Compression {
 /// Available data pages for Parquet file format.
 /// Note that some of the page types may not be supported.
 #[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
 pub enum PageType {
     DATA_PAGE,
     INDEX_PAGE,
@@ -313,6 +318,7 @@
 /// See reference in
 ///
 #[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
 pub enum SortOrder {
     /// Signed (either value or legacy byte-wise) comparison.
     SIGNED,
@@ -328,6 +334,7 @@
 /// If column order is undefined, then it is the legacy behaviour and all values should
 /// be compared as signed values/bytes.
 #[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
 pub enum ColumnOrder {
     /// Column uses the order defined by its logical or physical type
     /// (if there is no logical type), parquet-format 2.4.0+.
@@ -489,7 +496,7 @@ impl fmt::Display for ColumnOrder {
 // ----------------------------------------------------------------------
 // parquet::Type <=> Type conversion
 
-impl convert::From<parquet::Type> for Type {
+impl From<parquet::Type> for Type {
     fn from(value: parquet::Type) -> Self {
         match value {
             parquet::Type::Boolean => Type::BOOLEAN,
@@ -504,7 +511,7 @@
     }
 }
 
-impl convert::From<Type> for parquet::Type {
+impl From<Type> for parquet::Type {
     fn from(value: Type) -> Self {
         match value {
             Type::BOOLEAN => parquet::Type::Boolean,
@@ -522,7 +529,7 @@
 // ----------------------------------------------------------------------
 // parquet::ConvertedType <=> ConvertedType conversion
 
-impl convert::From<Option<parquet::ConvertedType>> for ConvertedType {
+impl From<Option<parquet::ConvertedType>> for ConvertedType {
     fn from(option: Option<parquet::ConvertedType>) -> Self {
         match option {
             None => ConvertedType::NONE,
@@ -558,7 +565,7 @@
     }
 }
 
-impl convert::From<ConvertedType> for Option<parquet::ConvertedType> {
+impl From<ConvertedType> for Option<parquet::ConvertedType> {
     fn from(value: ConvertedType) -> Self {
         match value {
             ConvertedType::NONE => None,
@@ -595,7 +602,7 @@
 // ----------------------------------------------------------------------
 // parquet::LogicalType <=> LogicalType conversion
 
-impl convert::From<parquet::LogicalType> for LogicalType {
+impl From<parquet::LogicalType> for LogicalType {
     fn from(value: parquet::LogicalType) -> Self {
         match value {
             parquet::LogicalType::STRING(_) => LogicalType::String,
@@ -627,7 +634,7 @@
     }
 }
 
-impl convert::From<LogicalType> for parquet::LogicalType {
+impl From<LogicalType> for parquet::LogicalType {
     fn from(value: LogicalType) -> Self {
         match value {
             LogicalType::String => parquet::LogicalType::STRING(Default::default()),
@@ -723,7 +730,7 @@ impl From<Option<LogicalType>> for ConvertedType {
 // ----------------------------------------------------------------------
 // parquet::FieldRepetitionType <=> Repetition conversion
 
-impl convert::From<parquet::FieldRepetitionType> for Repetition {
+impl From<parquet::FieldRepetitionType> for Repetition {
     fn from(value: parquet::FieldRepetitionType) -> Self {
         match value {
             parquet::FieldRepetitionType::Required => Repetition::REQUIRED,
@@ -733,7 +740,7 @@
     }
 }
 
-impl convert::From<Repetition> for parquet::FieldRepetitionType {
+impl From<Repetition> for parquet::FieldRepetitionType {
     fn from(value: Repetition) -> Self {
         match value {
             Repetition::REQUIRED => parquet::FieldRepetitionType::Required,
@@ -746,7 +753,7 @@
 // ----------------------------------------------------------------------
 // parquet::Encoding <=> Encoding conversion
 
-impl convert::From<parquet::Encoding> for Encoding {
+impl From<parquet::Encoding> for Encoding {
     fn from(value: parquet::Encoding) -> Self {
         match value {
             parquet::Encoding::Plain => Encoding::PLAIN,
@@ -762,7 +769,7 @@
     }
 }
 
-impl convert::From<Encoding> for parquet::Encoding {
+impl From<Encoding> for parquet::Encoding {
     fn from(value: Encoding) -> Self {
         match value {
             Encoding::PLAIN => parquet::Encoding::Plain,
@@ -781,7 +788,7 @@
 // ----------------------------------------------------------------------
 // parquet::CompressionCodec <=> Compression conversion
 
-impl convert::From<parquet::CompressionCodec> for Compression {
+impl From<parquet::CompressionCodec> for Compression {
     fn from(value: parquet::CompressionCodec) -> Self {
         match value {
             parquet::CompressionCodec::Uncompressed => Compression::UNCOMPRESSED,
@@ -795,7 +802,7 @@
     }
 }
 
-impl convert::From<Compression> for parquet::CompressionCodec {
+impl From<Compression> for parquet::CompressionCodec {
     fn from(value: Compression) -> Self {
         match value {
             Compression::UNCOMPRESSED => parquet::CompressionCodec::Uncompressed,
@@ -812,7 +819,7 @@ impl From<Compression> for parquet::CompressionCodec {
 // ----------------------------------------------------------------------
 // parquet::PageType <=> PageType conversion
 
-impl convert::From<parquet::PageType> for PageType {
+impl From<parquet::PageType> for PageType {
     fn from(value: parquet::PageType) -> Self {
         match value {
             parquet::PageType::DataPage => PageType::DATA_PAGE,
@@ -823,7 +830,7 @@
     }
 }
 
-impl convert::From<PageType> for parquet::PageType {
+impl From<PageType> for parquet::PageType {
     fn from(value: PageType) -> Self {
         match value {
             PageType::DATA_PAGE => parquet::PageType::DataPage,
diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs
index aa1d50563cd9..827aa7311f58 100644
--- a/parquet/src/bin/parquet-fromcsv.rs
+++ b/parquet/src/bin/parquet-fromcsv.rs
@@ -439,7 +439,7 @@ mod tests {
         // test default values
         assert_eq!(args.input_format, CsvDialect::Csv);
         assert_eq!(args.batch_size, 1000);
-        assert_eq!(args.has_header, false);
+        assert!(!args.has_header);
         assert_eq!(args.delimiter, None);
         assert_eq!(args.get_delimiter(), b',');
         assert_eq!(args.record_terminator, None);
@@ -553,7 +553,7 @@
             Field::new("field5", DataType::Utf8, false),
         ]));
 
-        let reader_builder = configure_reader_builder(&args, arrow_schema.clone());
+        let reader_builder = configure_reader_builder(&args, arrow_schema);
         let builder_debug = format!("{:?}", reader_builder);
         assert_debug_text(&builder_debug, "has_header", "false");
         assert_debug_text(&builder_debug, "delimiter", "Some(44)");
@@ -585,7 +585,7 @@
             Field::new("field4", DataType::Utf8, false),
             Field::new("field5", DataType::Utf8, false),
         ]));
-        let reader_builder = configure_reader_builder(&args, arrow_schema.clone());
+        let reader_builder = configure_reader_builder(&args, arrow_schema);
         let builder_debug = format!("{:?}", reader_builder);
         assert_debug_text(&builder_debug, "has_header", "true");
         assert_debug_text(&builder_debug, "delimiter", "Some(9)");
diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs
index 0530afaa786a..927d96f8cde7 100644
--- a/parquet/src/bin/parquet-read.rs
+++ b/parquet/src/bin/parquet-read.rs
@@ -93,6 +93,6 @@ fn print_row(row: &Row, json: bool) {
     if json {
         println!("{}", row.to_json_value())
     } else {
-        println!("{}", row.to_string());
+        println!("{}", row);
     }
 }
diff --git a/parquet/src/bin/parquet-schema.rs b/parquet/src/bin/parquet-schema.rs
index b875b0e7102b..68c52def7c44 100644
--- a/parquet/src/bin/parquet-schema.rs
+++ b/parquet/src/bin/parquet-schema.rs
@@ -67,9 +67,9 @@ fn main() {
             println!("Metadata for file: {}", &filename);
             println!();
             if verbose {
-                print_parquet_metadata(&mut std::io::stdout(), &metadata);
+                print_parquet_metadata(&mut std::io::stdout(), metadata);
             } else {
-                print_file_metadata(&mut std::io::stdout(), &metadata.file_metadata());
+                print_file_metadata(&mut std::io::stdout(), metadata.file_metadata());
             }
         }
     }
diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs
index c61e9c0b343e..1658797cee7d 100644
--- a/parquet/src/column/page.rs
+++ b/parquet/src/column/page.rs
@@ -174,6 +174,12 @@ pub struct PageWriteSpec {
     pub bytes_written: u64,
 }
 
+impl Default for PageWriteSpec {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl PageWriteSpec {
     /// Creates new spec with default page write metrics.
     pub fn new() -> Self {
diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs
index 8e0fa5a4d5aa..8eee807c2bed 100644
--- a/parquet/src/column/reader.rs
+++ b/parquet/src/column/reader.rs
@@ -544,8 +544,8 @@ mod tests {
     use crate::basic::Type as PhysicalType;
     use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType};
-    use crate::util::test_common::make_pages;
     use crate::util::test_common::page_util::InMemoryPageReader;
+    use crate::util::test_common::rand_gen::make_pages;
 
     const NUM_LEVELS: usize = 128;
     const NUM_PAGES: usize = 2;
@@ -1231,6 +1231,7 @@
     // Helper function for the general case of `read_batch()` where `values`,
     // `def_levels` and `rep_levels` are always provided with enough space.
+    #[allow(clippy::too_many_arguments)]
     fn test_read_batch_general(
         &mut self,
         desc: ColumnDescPtr,
@@ -1262,6 +1263,7 @@
     // Helper function to test `read_batch()` method with custom buffers for values,
     // definition and repetition levels.
+    #[allow(clippy::too_many_arguments)]
     fn test_read_batch(
         &mut self,
         desc: ColumnDescPtr,
diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs
index d7363129f1ea..4fb4f210e146 100644
--- a/parquet/src/column/writer/encoder.rs
+++ b/parquet/src/column/writer/encoder.rs
@@ -168,7 +168,6 @@ impl<T: DataType> ColumnValueEncoder for ColumnValueEncoderImpl<T> {
         // Set either main encoder or fallback encoder.
         let encoder = get_encoder(
-            descr.clone(),
             props
                 .encoding(descr.path())
                 .unwrap_or_else(|| fallback_encoding(T::get_physical_type(), props)),
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 20a3babd8aa5..669cacee6460 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -258,6 +258,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
         }
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub(crate) fn write_batch_internal(
         &mut self,
         values: &E::Values,
@@ -907,12 +908,6 @@
             _ => {}
         }
     }
-
-    /// Returns reference to the underlying page writer.
-    /// This method is intended to use in tests only.
-    fn get_page_writer_ref(&self) -> &dyn PageWriter {
-        self.page_writer.as_ref()
-    }
 }
 
 fn update_min(
@@ -1101,7 +1096,7 @@ mod tests {
         writer::SerializedPageWriter,
     };
     use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType};
-    use crate::util::{io::FileSource, test_common::random_numbers_range};
+    use crate::util::{io::FileSource, test_common::rand_gen::random_numbers_range};
 
     use super::*;
 
@@ -2408,20 +2403,6 @@
         get_typed_column_writer::<T>(column_writer)
     }
 
-    /// Returns decimals column reader.
-    fn get_test_decimals_column_reader<T: DataType>(
-        page_reader: Box<dyn PageReader>,
-        max_def_level: i16,
-        max_rep_level: i16,
-    ) -> ColumnReaderImpl<T> {
-        let descr = Arc::new(get_test_decimals_column_descr::<T>(
-            max_def_level,
-            max_rep_level,
-        ));
-        let column_reader = get_column_reader(descr, page_reader);
-        get_typed_column_reader::<T>(column_reader)
-    }
-
     /// Returns descriptor for Decimal type with primitive column.
     fn get_test_decimals_column_descr<T: DataType>(
         max_def_level: i16,
@@ -2456,20 +2437,6 @@
         get_typed_column_writer::<T>(column_writer)
     }
 
-    /// Returns column reader for UINT32 Column provided as ConvertedType only
-    fn get_test_unsigned_int_given_as_converted_column_reader<T: DataType>(
-        page_reader: Box<dyn PageReader>,
-        max_def_level: i16,
-        max_rep_level: i16,
-    ) -> ColumnReaderImpl<T> {
-        let descr = Arc::new(get_test_converted_type_unsigned_integer_column_descr::<T>(
-            max_def_level,
-            max_rep_level,
-        ));
-        let column_reader = get_column_reader(descr, page_reader);
-        get_typed_column_reader::<T>(column_reader)
-    }
-
     /// Returns column descriptor for UINT32 Column provided as ConvertedType only
     fn get_test_converted_type_unsigned_integer_column_descr<T: DataType>(
         max_def_level: i16,
diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs
index a5e49360a28a..ee5141cbe140 100644
--- a/parquet/src/compression.rs
+++ b/parquet/src/compression.rs
@@ -329,7 +329,7 @@ pub use zstd_codec::*;
 mod tests {
     use super::*;
 
-    use crate::util::test_common::*;
+    use crate::util::test_common::rand_gen::random_bytes;
 
     fn test_roundtrip(c: CodecType, data: &[u8]) {
         let mut c1 = create_codec(c).unwrap().unwrap();
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index 43c9a4238a71..005b35570a32 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -574,8 +574,6 @@ pub(crate) mod private {
     use super::{ParquetError, Result, SliceAsBytes};
 
-    pub type BitIndex = u64;
-
     /// Sealed trait to start to remove specialisation from implementations
     ///
     /// This is done to force the associated value type to be unimplementable outside of this
@@ -710,19 +708,6 @@
         }
     }
 
-    /// Hopelessly unsafe function that emulates `num::as_ne_bytes`
-    ///
-    /// It is not recommended to use this outside of this private module as, while it
-    /// _should_ work for primitive values, it is little better than a transmutation
-    /// and can act as a backdoor into mis-interpreting types as arbitary byte slices
-    #[inline]
-    fn as_raw<'a, T>(value: *const T) -> &'a [u8] {
-        unsafe {
-            let value = value as *const u8;
-            std::slice::from_raw_parts(value, std::mem::size_of::<T>())
-        }
-    }
-
     macro_rules! impl_from_raw {
         ($ty: ty, $physical_ty: expr, $self: ident => $as_i64: block) => {
             impl ParquetValueType for $ty {
diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs
index 58aa592d1424..3db050a0f43d 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -322,6 +322,12 @@ pub struct DictDecoder<T: DataType> {
     num_values: usize,
 }
 
+impl<T: DataType> Default for DictDecoder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> DictDecoder<T> {
     /// Creates new dictionary decoder.
     pub fn new() -> Self {
@@ -394,6 +400,12 @@ pub struct RleValueDecoder<T: DataType> {
     _phantom: PhantomData<T>,
 }
 
+impl<T: DataType> Default for RleValueDecoder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> RleValueDecoder<T> {
     pub fn new() -> Self {
         Self {
@@ -485,6 +497,15 @@ pub struct DeltaBitPackDecoder<T: DataType> {
     last_value: T::T,
 }
 
+impl<T: DataType> Default for DeltaBitPackDecoder<T>
+where
+    T::T: Default + FromPrimitive + WrappingAdd + Copy,
+{
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> DeltaBitPackDecoder<T>
 where
     T::T: Default + FromPrimitive + WrappingAdd + Copy,
@@ -706,8 +727,6 @@
         Ok(to_read)
     }
-
-
     fn values_left(&self) -> usize {
         self.values_left
     }
@@ -751,6 +770,12 @@ pub struct DeltaLengthByteArrayDecoder<T: DataType> {
     _phantom: PhantomData<T>,
 }
 
+impl<T: DataType> Default for DeltaLengthByteArrayDecoder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> DeltaLengthByteArrayDecoder<T> {
     /// Creates new delta length byte array decoder.
     pub fn new() -> Self {
@@ -829,7 +854,10 @@
             Type::BYTE_ARRAY => {
                 let num_values = cmp::min(num_values, self.num_values);
 
-                let next_offset: i32 = self.lengths[self.current_idx..self.current_idx + num_values].iter().sum();
+                let next_offset: i32 = self.lengths
+                    [self.current_idx..self.current_idx + num_values]
+                    .iter()
+                    .sum();
 
                 self.current_idx += num_values;
                 self.offset += next_offset as usize;
@@ -837,8 +865,9 @@
                 self.num_values -= num_values;
                 Ok(num_values)
             }
-            other_type => Err(general_err!(
-                "DeltaLengthByteArrayDecoder not support {}, only support byte array", other_type
+            other_type => Err(general_err!(
+                "DeltaLengthByteArrayDecoder not support {}, only support byte array",
+                other_type
             )),
         }
     }
@@ -874,6 +903,12 @@ pub struct DeltaByteArrayDecoder<T: DataType> {
     _phantom: PhantomData<T>,
 }
 
+impl<T: DataType> Default for DeltaByteArrayDecoder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> DeltaByteArrayDecoder<T> {
     /// Creates new delta byte array decoder.
     pub fn new() -> Self {
@@ -990,7 +1025,7 @@ mod tests {
     use crate::schema::types::{
         ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType,
     };
-    use crate::util::{bit_util::set_array_bit, test_common::RandGen};
+    use crate::util::test_common::rand_gen::RandGen;
 
     #[test]
     fn test_get_decoders() {
@@ -1068,13 +1103,7 @@
     fn test_plain_skip_all_int32() {
         let data = vec![42, 18, 52];
         let data_bytes = Int32Type::to_byte_array(&data[..]);
-        test_plain_skip::<Int32Type>(
-            ByteBufferPtr::new(data_bytes),
-            3,
-            5,
-            -1,
-            &[],
-        );
+        test_plain_skip::<Int32Type>(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]);
     }
 
     #[test]
@@ -1096,7 +1125,6 @@
         );
     }
 
-
     #[test]
     fn test_plain_decode_int64() {
         let data = vec![42, 18, 52];
@@ -1128,16 +1156,9 @@
     fn test_plain_skip_all_int64() {
         let data = vec![42, 18, 52];
         let data_bytes = Int64Type::to_byte_array(&data[..]);
-        test_plain_skip::<Int64Type>(
-            ByteBufferPtr::new(data_bytes),
-            3,
-            3,
-            -1,
-            &[],
-        );
+        test_plain_skip::<Int64Type>(ByteBufferPtr::new(data_bytes), 3, 3, -1, &[]);
     }
 
-
     #[test]
     fn test_plain_decode_float() {
         let data = vec![3.14, 2.414, 12.51];
@@ -1169,13 +1190,7 @@
     fn test_plain_skip_all_float() {
         let data = vec![3.14, 2.414, 12.51];
         let data_bytes = FloatType::to_byte_array(&data[..]);
-        test_plain_skip::<FloatType>(
-            ByteBufferPtr::new(data_bytes),
-            3,
-            4,
-            -1,
-            &[],
-        );
+        test_plain_skip::<FloatType>(ByteBufferPtr::new(data_bytes), 3, 4, -1, &[]);
     }
 
     #[test]
@@ -1195,13 +1210,7 @@
     fn test_plain_skip_all_double() {
         let data = vec![3.14f64, 2.414f64, 12.51f64];
         let data_bytes = DoubleType::to_byte_array(&data[..]);
-        test_plain_skip::<DoubleType>(
-            ByteBufferPtr::new(data_bytes),
-            3,
-            5,
-            -1,
-            &[],
-        );
+        test_plain_skip::<DoubleType>(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]);
     }
 
     #[test]
@@ -1261,13 +1270,7 @@
         data[2].set_data(10, 20, 30);
         data[3].set_data(40, 50, 60);
         let data_bytes = Int96Type::to_byte_array(&data[..]);
-        test_plain_skip::<Int96Type>(
-            ByteBufferPtr::new(data_bytes),
-            4,
-            8,
-            -1,
-            &[],
-        );
+        test_plain_skip::<Int96Type>(ByteBufferPtr::new(data_bytes), 4, 8, -1, &[]);
     }
 
     #[test]
@@ -1307,16 +1310,9 @@
             false, true, false, false, true, false, true, true, false, true,
         ];
         let data_bytes = BoolType::to_byte_array(&data[..]);
-        test_plain_skip::<BoolType>(
-            ByteBufferPtr::new(data_bytes),
-            10,
-            20,
-            -1,
-            &[],
-        );
+        test_plain_skip::<BoolType>(ByteBufferPtr::new(data_bytes), 10, 20, -1, &[]);
     }
 
-
     #[test]
     fn test_plain_decode_byte_array() {
         let mut data = vec![ByteArray::new(); 2];
@@ -1354,13 +1350,7 @@
         data[0].set_data(ByteBufferPtr::new(String::from("hello").into_bytes()));
         data[1].set_data(ByteBufferPtr::new(String::from("parquet").into_bytes()));
         let data_bytes = ByteArrayType::to_byte_array(&data[..]);
-        test_plain_skip::<ByteArrayType>(
-            ByteBufferPtr::new(data_bytes),
-            2,
-            2,
-            -1,
-            &[],
-        );
+        test_plain_skip::<ByteArrayType>(ByteBufferPtr::new(data_bytes), 2, 2, -1, &[]);
     }
 
     #[test]
@@ -1587,7 +1577,6 @@
         ];
         test_skip::<Int32Type>(block_data.clone(), Encoding::DELTA_BINARY_PACKED, 5);
         test_skip::<Int32Type>(block_data, Encoding::DELTA_BINARY_PACKED, 100);
-
     }
 
     #[test]
@@ -1833,8 +1822,7 @@
         let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type());
 
         // Encode data
-        let mut encoder =
-            get_encoder::<T>(col_descr.clone(), encoding).expect("get encoder");
+        let mut encoder = get_encoder::<T>(encoding).expect("get encoder");
         for v in &data[..] {
             encoder.put(&v[..]).expect("ok to encode");
@@ -1867,17 +1855,14 @@
         let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type());
 
         // Encode data
-        let mut encoder =
-            get_encoder::<T>(col_descr.clone(), encoding).expect("get encoder");
+        let mut encoder = get_encoder::<T>(encoding).expect("get encoder");
         encoder.put(&data).expect("ok to encode");
 
         let bytes = encoder.flush_buffer().expect("ok to flush buffer");
 
         let mut decoder = get_decoder::<T>(col_descr, encoding).expect("get decoder");
-        decoder
-            .set_data(bytes, data.len())
-            .expect("ok to set data");
+        decoder.set_data(bytes, data.len()).expect("ok to set data");
 
         if skip >= data.len() {
             let skipped = decoder.skip(skip).expect("ok to skip");
@@ -1894,7 +1879,7 @@
             let expected = &data[skip..];
             let mut buffer = vec![T::T::default(); remaining];
             let fetched = decoder.get(&mut buffer).expect("ok to decode");
-            assert_eq!(remaining,fetched);
+            assert_eq!(remaining, fetched);
             assert_eq!(&buffer, expected);
         }
     }
@@ -1966,7 +1951,7 @@
                     v.push(0);
                 }
                 if *item {
-                    set_array_bit(&mut v[..], i);
+                    v[i / 8] |= 1 << (i % 8);
                 }
             }
             v
diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs
index a7855cc84606..18deba65e687 100644
--- a/parquet/src/encodings/encoding/dict_encoder.rs
+++ b/parquet/src/encodings/encoding/dict_encoder.rs
@@ -73,9 +73,6 @@ impl<T: DataType> Storage for KeyStorage<T> {
 /// (max bit width = 32), followed by the values encoded using RLE/Bit packed described
 /// above (with the given bit width).
 pub struct DictEncoder<T: DataType> {
-    /// Descriptor for the column to be encoded.
-    desc: ColumnDescPtr,
-
     interner: Interner<KeyStorage<T>>,
 
     /// The buffered indices
@@ -92,7 +89,6 @@
         };
 
         Self {
-            desc,
             interner: Interner::new(storage),
             indices: vec![],
         }
@@ -117,7 +113,7 @@
     /// Writes out the dictionary values with PLAIN encoding in a byte buffer, and return
     /// the result.
     pub fn write_dict(&self) -> Result<ByteBufferPtr> {
-        let mut plain_encoder = PlainEncoder::<T>::new(self.desc.clone(), vec![]);
+        let mut plain_encoder = PlainEncoder::<T>::new();
         plain_encoder.put(&self.interner.storage().uniques)?;
         plain_encoder.flush_buffer()
     }
diff --git a/parquet/src/encodings/encoding/mod.rs b/parquet/src/encodings/encoding/mod.rs
index 2c6bd9b371a0..050f1b9f8a63 100644
--- a/parquet/src/encodings/encoding/mod.rs
+++ b/parquet/src/encodings/encoding/mod.rs
@@ -24,7 +24,6 @@ use crate::data_type::private::ParquetValueType;
 use crate::data_type::*;
 use crate::encodings::rle::RleEncoder;
 use crate::errors::{ParquetError, Result};
-use crate::schema::types::ColumnDescPtr;
 use crate::util::{
     bit_util::{self, num_required_bits, BitWriter},
     memory::ByteBufferPtr,
@@ -76,12 +75,9 @@ pub trait Encoder<T: DataType> {
 /// Gets a encoder for the particular data type `T` and encoding `encoding`. Memory usage
 /// for the encoder instance is tracked by `mem_tracker`.
-pub fn get_encoder<T: DataType>(
-    desc: ColumnDescPtr,
-    encoding: Encoding,
-) -> Result<Box<dyn Encoder<T>>> {
+pub fn get_encoder<T: DataType>(encoding: Encoding) -> Result<Box<dyn Encoder<T>>> {
     let encoder: Box<dyn Encoder<T>> = match encoding {
-        Encoding::PLAIN => Box::new(PlainEncoder::new(desc, vec![])),
+        Encoding::PLAIN => Box::new(PlainEncoder::new()),
         Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => {
             return Err(general_err!(
                 "Cannot initialize this encoding through this function"
@@ -113,17 +109,21 @@
 pub struct PlainEncoder<T: DataType> {
     buffer: Vec<u8>,
     bit_writer: BitWriter,
-    desc: ColumnDescPtr,
     _phantom: PhantomData<T>,
 }
 
+impl<T: DataType> Default for PlainEncoder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> PlainEncoder<T> {
     /// Creates new plain encoder.
-    pub fn new(desc: ColumnDescPtr, buffer: Vec<u8>) -> Self {
+    pub fn new() -> Self {
         Self {
-            buffer,
+            buffer: vec![],
             bit_writer: BitWriter::new(256),
-            desc,
             _phantom: PhantomData,
         }
    }
@@ -171,6 +171,12 @@ pub struct RleValueEncoder<T: DataType> {
     _phantom: PhantomData<T>,
 }
 
+impl<T: DataType> Default for RleValueEncoder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> RleValueEncoder<T> {
     /// Creates new rle value encoder.
     pub fn new() -> Self {
@@ -280,6 +286,12 @@ pub struct DeltaBitPackEncoder<T: DataType> {
     _phantom: PhantomData<T>,
 }
 
+impl<T: DataType> Default for DeltaBitPackEncoder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> DeltaBitPackEncoder<T> {
     /// Creates new delta bit packed encoder.
     pub fn new() -> Self {
@@ -531,6 +543,12 @@ pub struct DeltaLengthByteArrayEncoder<T: DataType> {
     _phantom: PhantomData<T>,
 }
 
+impl<T: DataType> Default for DeltaLengthByteArrayEncoder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> DeltaLengthByteArrayEncoder<T> {
     /// Creates new delta length byte array encoder.
     pub fn new() -> Self {
@@ -610,6 +628,12 @@ pub struct DeltaByteArrayEncoder<T: DataType> {
     _phantom: PhantomData<T>,
 }
 
+impl<T: DataType> Default for DeltaByteArrayEncoder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<T: DataType> DeltaByteArrayEncoder<T> {
     /// Creates new delta byte array encoder.
     pub fn new() -> Self {
@@ -705,7 +729,7 @@ mod tests {
     use crate::schema::types::{
         ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType,
     };
-    use crate::util::test_common::{random_bytes, RandGen};
+    use crate::util::test_common::rand_gen::{random_bytes, RandGen};
 
     const TEST_SET_SIZE: usize = 1024;
 
@@ -847,7 +871,7 @@
             Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => {
                 Box::new(create_test_dict_encoder::<T>(type_length))
             }
-            _ => create_test_encoder::<T>(type_length, encoding),
+            _ => create_test_encoder::<T>(encoding),
         };
         assert_eq!(encoder.estimated_data_encoded_size(), initial_size);
 
@@ -900,7 +924,7 @@
     #[test]
     fn test_issue_47() {
         let mut encoder =
-            create_test_encoder::<ByteArrayType>(0, Encoding::DELTA_BYTE_ARRAY);
+            create_test_encoder::<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY);
         let mut decoder =
             create_test_decoder::<ByteArrayType>(0, Encoding::DELTA_BYTE_ARRAY);
@@ -952,7 +976,7 @@
     impl<T: DataType + RandGen<T>> EncodingTester<T> for T {
         fn test_internal(enc: Encoding, total: usize, type_length: i32) -> Result<()> {
-            let mut encoder = create_test_encoder::<T>(type_length, enc);
+            let mut encoder = create_test_encoder::<T>(enc);
             let mut decoder = create_test_decoder::<T>(type_length, enc);
             let mut values = <T as RandGen<T>>::gen_vec(type_length, total);
             let mut result_data = vec![T::T::default(); total];
@@ -1054,8 +1078,7 @@
         encoding: Encoding,
         err: Option<ParquetError>,
     ) {
-        let descr = create_test_col_desc_ptr(-1, T::get_physical_type());
-        let encoder = get_encoder::<T>(descr, encoding);
+        let encoder = get_encoder::<T>(encoding);
         match err {
             Some(parquet_error) => {
                 assert!(encoder.is_err());
@@ -1082,12 +1105,8 @@
         ))
     }
 
-    fn create_test_encoder<T: DataType>(
-        type_len: i32,
-        enc: Encoding,
-    ) -> Box<dyn Encoder<T>> {
-        let desc = create_test_col_desc_ptr(type_len, T::get_physical_type());
-        get_encoder(desc, enc).unwrap()
+    fn create_test_encoder<T: DataType>(enc: Encoding) -> Box<dyn Encoder<T>> {
+        get_encoder(enc).unwrap()
     }
 
     fn create_test_decoder<T: DataType>(
diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs
index 62c68d843c71..40af135ff28d 100644
--- a/parquet/src/encodings/levels.rs
+++ b/parquet/src/encodings/levels.rs
@@ -142,12 +142,14 @@ impl LevelEncoder {
 /// Decoder for definition/repetition levels.
 /// Currently only supports RLE and BIT_PACKED encoding for Data Page v1 and
 /// RLE for Data Page v2.
+#[allow(unused)]
 pub enum LevelDecoder {
     Rle(Option<usize>, RleDecoder),
     RleV2(Option<usize>, RleDecoder),
     BitPacked(Option<usize>, u8, BitReader),
 }
 
+#[allow(unused)]
 impl LevelDecoder {
     /// Creates new level decoder based on encoding and max definition/repetition level.
     /// This method only initializes level decoder, `set_data` method must be called
@@ -274,7 +276,7 @@
 mod tests {
     use super::*;
 
-    use crate::util::test_common::random_numbers_range;
+    use crate::util::test_common::rand_gen::random_numbers_range;
 
     fn test_internal_roundtrip(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) {
         let mut encoder = if v2 {
diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs
index aad833e0eee3..39a0aa4d03da 100644
--- a/parquet/src/encodings/rle.rs
+++ b/parquet/src/encodings/rle.rs
@@ -45,7 +45,6 @@ use crate::util::{
 /// Maximum groups per bit-packed run. Current value is 64.
 const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6;
 const MAX_VALUES_PER_BIT_PACKED_RUN: usize = MAX_GROUPS_PER_BIT_PACKED_RUN * 8;
-const MAX_WRITER_BUF_SIZE: usize = 1 << 10;
 
 /// A RLE/Bit-Packing hybrid encoder.
 // TODO: tracking memory usage
@@ -56,9 +55,6 @@ pub struct RleEncoder {
     // Underlying writer which holds an internal buffer.
     bit_writer: BitWriter,
 
-    // The maximum byte size a single run can take.
-    max_run_byte_size: usize,
-
     // Buffered values for bit-packed runs.
     buffered_values: [u64; 8],
@@ -82,6 +78,7 @@ pub struct RleEncoder {
 }
 
 impl RleEncoder {
+    #[allow(unused)]
     pub fn new(bit_width: u8, buffer_len: usize) -> Self {
         let buffer = Vec::with_capacity(buffer_len);
         RleEncoder::new_from_buf(bit_width, buffer)
@@ -89,12 +86,10 @@
 
     /// Initialize the encoder from existing `buffer`
     pub fn new_from_buf(bit_width: u8, buffer: Vec<u8>) -> Self {
-        let max_run_byte_size = RleEncoder::min_buffer_size(bit_width);
         let bit_writer = BitWriter::new_from_buf(buffer);
         RleEncoder {
             bit_width,
             bit_writer,
-            max_run_byte_size,
             buffered_values: [0; 8],
             num_buffered_values: 0,
             current_value: 0,
@@ -162,6 +157,7 @@
     }
 
     #[inline]
+    #[allow(unused)]
     pub fn buffer(&self) -> &[u8] {
         self.bit_writer.buffer()
     }
@@ -171,6 +167,7 @@
         self.bit_writer.bytes_written()
     }
 
+    #[allow(unused)]
     pub fn is_empty(&self) -> bool {
         self.bit_writer.bytes_written() == 0
     }
@@ -184,6 +181,7 @@
     /// Borrow equivalent of the `consume` method.
     /// Call `clear()` after invoking this method.
     #[inline]
+    #[allow(unused)]
     pub fn flush_buffer(&mut self) -> &[u8] {
         self.flush();
         self.bit_writer.flush_buffer()
@@ -192,6 +190,7 @@
     /// Clears the internal state so this encoder can be reused (e.g., after becoming
     /// full).
     #[inline]
+    #[allow(unused)]
     pub fn clear(&mut self) {
         self.bit_writer.clear();
         self.num_buffered_values = 0;
diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs
index c2fb5bd66cf9..68378419330e 100644
--- a/parquet/src/errors.rs
+++ b/parquet/src/errors.rs
@@ -148,8 +148,8 @@ macro_rules! arrow_err {
 // Convert parquet error into other errors
 #[cfg(any(feature = "arrow", test))]
-impl Into<ArrowError> for ParquetError {
-    fn into(self) -> ArrowError {
-        ArrowError::ParquetError(format!("{}", self))
+impl From<ParquetError> for ArrowError {
+    fn from(p: ParquetError) -> Self {
+        Self::ParquetError(format!("{}", p))
     }
 }
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index 58eaf7a8c875..018dd95d9f35 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -834,6 +834,12 @@ pub struct ColumnIndexBuilder {
     valid: bool,
 }
 
+impl Default for ColumnIndexBuilder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl ColumnIndexBuilder {
     pub fn new() -> Self {
         ColumnIndexBuilder {
@@ -887,6 +893,12 @@
     current_first_row_index: i64,
 }
 
+impl Default for OffsetIndexBuilder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl OffsetIndexBuilder {
     pub fn new() -> Self {
         OffsetIndexBuilder {
diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index 45381234c027..f29b80accae2 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -47,6 +47,7 @@ impl<T> PageIndex<T> {
 }
 
 #[derive(Debug, Clone, PartialEq)]
+#[allow(non_camel_case_types)]
 pub enum Index {
     /// Sometimes reading page index from parquet file
     /// will only return pageLocations without min_max index,
diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs
index fc87ef20448f..bb7808f16487 100644
--- a/parquet/src/file/page_index/mod.rs
+++ b/parquet/src/file/page_index/mod.rs
@@ -17,4 +17,6 @@
 
 pub mod index;
 pub mod index_reader;
+
+#[cfg(test)]
 pub(crate) mod range;
diff --git a/parquet/src/file/page_index/range.rs b/parquet/src/file/page_index/range.rs
index 06c06553ccd5..e9741ec8e7fd 100644
--- a/parquet/src/file/page_index/range.rs
+++ b/parquet/src/file/page_index/range.rs
@@ -213,6 +213,7 @@ impl RowRanges {
         result
     }
 
+    #[allow(unused)]
     pub fn row_count(&self) -> usize {
         self.ranges.iter().map(|x| x.count()).sum()
     }
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 9ca7c4daa597..c96439820262 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -69,6 +69,7 @@ const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY");
 ///
 /// Basic constant, which is not part of the Thrift definition.
 #[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
 pub enum WriterVersion {
     PARQUET_1_0,
     PARQUET_2_0,
@@ -360,7 +361,7 @@ impl WriterPropertiesBuilder {
     fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
         self.column_properties
             .entry(col)
-            .or_insert(ColumnProperties::new())
+            .or_insert_with(ColumnProperties::new)
     }
 
     /// Sets encoding for a column.
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index e0053225d249..045410d03809 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -141,6 +141,7 @@ pub struct SerializedFileReader {
 /// A builder for [`ReadOptions`].
 /// For the predicates that are added to the builder,
 /// they will be chained using 'AND' to filter the row groups.
+#[derive(Default)]
 pub struct ReadOptionsBuilder {
     predicates: Vec<Box<dyn FnMut(&RowGroupMetaData, usize) -> bool>>,
     enable_page_index: bool,
}
 
 impl ReadOptionsBuilder {
     /// New builder
     pub fn new() -> Self {
-        ReadOptionsBuilder {
-            predicates: vec![],
-            enable_page_index: false,
-        }
+        Self::default()
     }
 
     /// Add a predicate on row group metadata to the reading option,
@@ -692,7 +690,7 @@ mod tests {
     use crate::record::RowAccessor;
     use crate::schema::parser::parse_message_type;
     use crate::util::bit_util::from_le_slice;
-    use crate::util::test_common::{get_test_file, get_test_path};
+    use crate::util::test_common::file_util::{get_test_file, get_test_path};
     use parquet_format::BoundaryOrder;
 
     use std::sync::Arc;
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 5ee43f8ad6fb..90fe399e78d7 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -33,14 +33,6 @@
 //!
 //! 3. [arrow::async_reader] for `async` reading and writing parquet
 //! files to Arrow `RecordBatch`es (requires the `async` feature).
-#![allow(dead_code)]
-#![allow(non_camel_case_types)]
-#![allow(
-    clippy::from_over_into,
-    clippy::new_without_default,
-    clippy::or_fun_call,
-    clippy::too_many_arguments
-)]
 
 /// Defines a an item with an experimental public API
 ///
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index 05b63661f09b..0b7e04587354 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -40,6 +40,12 @@ pub struct TreeBuilder {
     batch_size: usize,
 }
 
+impl Default for TreeBuilder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl TreeBuilder {
     /// Creates new tree builder with default parameters.
     pub fn new() -> Self {
@@ -822,7 +828,7 @@ mod tests {
     use crate::file::reader::{FileReader, SerializedFileReader};
     use crate::record::api::{Field, Row, RowAccessor, RowFormatter};
     use crate::schema::parser::parse_message_type;
-    use crate::util::test_common::{get_test_file, get_test_path};
+    use crate::util::test_common::file_util::{get_test_file, get_test_path};
     use std::convert::TryFrom;
 
     // Convenient macros to assemble row, list, map, and group.
diff --git a/parquet/src/record/triplet.rs b/parquet/src/record/triplet.rs
index de566a122e20..5a7e2a0ca74e 100644
--- a/parquet/src/record/triplet.rs
+++ b/parquet/src/record/triplet.rs
@@ -363,7 +363,7 @@ mod tests {
 
     use crate::file::reader::{FileReader, SerializedFileReader};
     use crate::schema::types::ColumnPath;
-    use crate::util::test_common::get_test_file;
+    use crate::util::test_common::file_util::get_test_file;
 
     #[test]
     #[should_panic(expected = "Expected positive batch size, found: 0")]
diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs
index 1dec9b03f082..c70384e7aa2f 100644
--- a/parquet/src/util/bit_util.rs
+++ b/parquet/src/util/bit_util.rs
@@ -101,38 +101,6 @@ macro_rules! read_num_bytes {
     }};
 }
 
-/// Converts value `val` of type `T` to a byte vector, by reading `num_bytes` from `val`.
-/// NOTE: if `val` is less than the size of `T` then it can be truncated.
-#[inline]
-pub fn convert_to_bytes<T>(val: &T, num_bytes: usize) -> Vec<u8>
-where
-    T: ?Sized + AsBytes,
-{
-    let mut bytes: Vec<u8> = vec![0; num_bytes];
-    memcpy_value(val.as_bytes(), num_bytes, &mut bytes);
-    bytes
-}
-
-#[inline]
-pub fn memcpy(source: &[u8], target: &mut [u8]) {
-    assert!(target.len() >= source.len());
-    target[..source.len()].copy_from_slice(source)
-}
-
-#[inline]
-pub fn memcpy_value<T>(source: &T, num_bytes: usize, target: &mut [u8])
-where
-    T: ?Sized + AsBytes,
-{
-    assert!(
-        target.len() >= num_bytes,
-        "Not enough space. Only had {} bytes but need to put {} bytes",
-        target.len(),
-        num_bytes
-    );
-    memcpy(&source.as_bytes()[..num_bytes], target)
-}
-
 /// Returns the ceil of value/divisor.
 ///
 /// This function should be removed after
@@ -152,16 +120,6 @@ pub fn trailing_bits(v: u64, num_bits: usize) -> u64 {
     }
 }
 
-#[inline]
-pub fn set_array_bit(bits: &mut [u8], i: usize) {
-    bits[i / 8] |= 1 << (i % 8);
-}
-
-#[inline]
-pub fn unset_array_bit(bits: &mut [u8], i: usize) {
-    bits[i / 8] &= !(1 << (i % 8));
-}
-
 /// Returns the minimum number of bits needed to represent the value 'x'
 #[inline]
 pub fn num_required_bits(x: u64) -> u8 {
@@ -728,20 +686,11 @@ impl From<Vec<u8>> for BitReader {
     }
 }
 
-/// Returns the nearest multiple of `factor` that is `>=` than `num`. Here `factor` must
-/// be a power of 2.
-///
-/// Copied from the arrow crate to make arrow optional
-pub fn round_upto_power_of_2(num: usize, factor: usize) -> usize {
-    debug_assert!(factor > 0 && (factor & (factor - 1)) == 0);
-    (num + (factor - 1)) & !(factor - 1)
-}
-
 #[cfg(test)]
 mod tests {
-    use super::super::test_common::*;
     use super::*;
 
+    use crate::util::test_common::rand_gen::random_numbers;
     use rand::distributions::{Distribution, Standard};
     use std::fmt::Debug;
 
@@ -874,25 +823,6 @@
         assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(-2));
     }
 
-    #[test]
-    fn test_set_array_bit() {
-        let mut buffer = vec![0, 0, 0];
-        set_array_bit(&mut buffer[..], 1);
-        assert_eq!(buffer, vec![2, 0, 0]);
-        set_array_bit(&mut buffer[..], 4);
-        assert_eq!(buffer, vec![18, 0, 0]);
-        unset_array_bit(&mut buffer[..], 1);
-        assert_eq!(buffer, vec![16, 0, 0]);
-        set_array_bit(&mut buffer[..], 10);
-        assert_eq!(buffer, vec![16, 4, 0]);
-        set_array_bit(&mut buffer[..], 10);
-        assert_eq!(buffer, vec![16, 4, 0]);
-        set_array_bit(&mut buffer[..], 11);
-        assert_eq!(buffer, vec![16, 12, 0]);
-        unset_array_bit(&mut buffer[..], 10);
-        assert_eq!(buffer, vec![16, 8, 0]);
-    }
-
     #[test]
     fn test_num_required_bits() {
         assert_eq!(num_required_bits(0), 0);
diff --git a/parquet/src/util/io.rs b/parquet/src/util/io.rs
index 016d45075ee9..1fb92063e27c 100644
--- a/parquet/src/util/io.rs
+++ b/parquet/src/util/io.rs
@@ -167,7 +167,7 @@ mod tests {
 
     use std::iter;
 
-    use crate::util::test_common::get_test_file;
+    use crate::util::test_common::file_util::get_test_file;
 
     #[test]
     fn test_io_read_fully() {
diff --git a/parquet/src/util/test_common/mod.rs b/parquet/src/util/test_common/mod.rs
index f0beb16ca954..504219ecae19 100644
--- a/parquet/src/util/test_common/mod.rs
+++ b/parquet/src/util/test_common/mod.rs
@@ -15,17 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-pub mod file_util;
 pub mod page_util;
-pub mod rand_gen;
-
-pub use self::rand_gen::random_bools;
-pub use self::rand_gen::random_bytes;
-pub use self::rand_gen::random_numbers;
-pub use self::rand_gen::random_numbers_range;
-pub use self::rand_gen::RandGen;
 
-pub use self::file_util::get_test_file;
-pub use self::file_util::get_test_path;
+#[cfg(test)]
+pub mod file_util;
 
-pub use self::page_util::make_pages;
+#[cfg(test)]
+pub mod rand_gen;
\ No newline at end of file
diff --git a/parquet/src/util/test_common/page_util.rs b/parquet/src/util/test_common/page_util.rs
index bc197d00e00d..dffcb2a44e87 100644
--- a/parquet/src/util/test_common/page_util.rs
+++ b/parquet/src/util/test_common/page_util.rs
@@ -19,14 +19,11 @@ use crate::basic::Encoding;
 use crate::column::page::{Page, PageIterator};
 use crate::column::page::{PageMetadata, PageReader};
 use crate::data_type::DataType;
-use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder};
+use crate::encodings::encoding::{get_encoder, Encoder};
 use crate::encodings::levels::LevelEncoder;
 use crate::errors::Result;
 use crate::schema::types::{ColumnDescPtr, SchemaDescPtr};
 use crate::util::memory::ByteBufferPtr;
-use crate::util::test_common::random_numbers_range;
-use rand::distributions::uniform::SampleUniform;
-use std::collections::VecDeque;
 use std::mem;
 
 pub trait DataPageBuilder {
@@ -44,7 +41,6 @@ pub trait DataPageBuilder {
 /// - consume()
 /// in order to populate and obtain a data page.
 pub struct DataPageBuilderImpl {
-    desc: ColumnDescPtr,
     encoding: Option<Encoding>,
     num_values: u32,
     buffer: Vec<u8>,
@@ -57,9 +53,8 @@ impl DataPageBuilderImpl {
     // `num_values` is the number of non-null values to put in the data page.
     // `datapage_v2` flag is used to indicate if the generated data page should use V2
     // format or not.
-    pub fn new(desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) -> Self {
+    pub fn new(_desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) -> Self {
         DataPageBuilderImpl {
-            desc,
             encoding: None,
             num_values,
             buffer: vec![],
@@ -111,8 +106,7 @@ impl DataPageBuilder for DataPageBuilderImpl {
         );
         self.encoding = Some(encoding);
         let mut encoder: Box<dyn Encoder<T>> =
-            get_encoder::<T>(self.desc.clone(), encoding)
-                .expect("get_encoder() should be OK");
+            get_encoder::<T>(encoding).expect("get_encoder() should be OK");
         encoder.put(values).expect("put() should be OK");
         let encoded_values = encoder
             .flush_buffer()
@@ -229,88 +223,3 @@ impl<P: Iterator<Item = Vec<Page>> + Send> PageIterator for InMemoryPageIterator
         Ok(self.column_desc.clone())
     }
 }
-
-pub fn make_pages<T: DataType>(
-    desc: ColumnDescPtr,
-    encoding: Encoding,
-    num_pages: usize,
-    levels_per_page: usize,
-    min: T::T,
-    max: T::T,
-    def_levels: &mut Vec<i16>,
-    rep_levels: &mut Vec<i16>,
-    values: &mut Vec<T::T>,
-    pages: &mut VecDeque<Page>,
-    use_v2: bool,
-) where
-    T::T: PartialOrd + SampleUniform + Copy,
-{
-    let mut num_values = 0;
-    let max_def_level = desc.max_def_level();
-    let max_rep_level = desc.max_rep_level();
-
-    let mut dict_encoder = DictEncoder::<T>::new(desc.clone());
-
-    for i in 0..num_pages {
-        let mut num_values_cur_page = 0;
-        let level_range = i * levels_per_page..(i + 1) * levels_per_page;
-
-        if max_def_level > 0 {
-            random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels);
-            for dl in &def_levels[level_range.clone()] {
-                if *dl == max_def_level {
-                    num_values_cur_page += 1;
-                }
-            }
-        } else {
-            num_values_cur_page = levels_per_page;
-        }
-        if max_rep_level > 0 {
-            random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels);
-        }
-        random_numbers_range(num_values_cur_page, min, max, values);
-
-        // Generate the current page
-
-        let mut pb =
-            DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2);
-        if max_rep_level > 0 {
-            pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]);
-        }
-        if max_def_level > 0 {
-            pb.add_def_levels(max_def_level, &def_levels[level_range]);
-        }
-
-        let value_range = num_values..num_values + num_values_cur_page;
-        match encoding {
-            Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => {
-                let _ = dict_encoder.put(&values[value_range.clone()]);
-                let indices = dict_encoder
-                    .write_indices()
-                    .expect("write_indices() should be OK");
-                pb.add_indices(indices);
-            }
-            Encoding::PLAIN => {
-                pb.add_values::<T>(encoding, &values[value_range]);
-            }
-            enc => panic!("Unexpected encoding {}", enc),
-        }
-
-        let data_page = pb.consume();
-        pages.push_back(data_page);
-        num_values += num_values_cur_page;
-    }
-
-    if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY {
-        let dict = dict_encoder
-            .write_dict()
-            .expect("write_dict() should be OK");
-        let dict_page = Page::DictionaryPage {
-            buf: dict,
-            num_values: dict_encoder.num_entries() as u32,
-            encoding: Encoding::RLE_DICTIONARY,
-            is_sorted: false,
-        };
-        pages.push_front(dict_page);
-    }
-}
diff --git a/parquet/src/util/test_common/rand_gen.rs b/parquet/src/util/test_common/rand_gen.rs
index d9c256577684..4e54aa7999cf 100644
--- a/parquet/src/util/test_common/rand_gen.rs
+++ b/parquet/src/util/test_common/rand_gen.rs
@@ -15,13 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::basic::Encoding;
+use crate::column::page::Page;
 use rand::{
     distributions::{uniform::SampleUniform, Distribution, Standard},
     thread_rng, Rng,
 };
+use std::collections::VecDeque;
 
 use crate::data_type::*;
+use crate::encodings::encoding::{DictEncoder, Encoder};
+use crate::schema::types::ColumnDescPtr;
 use crate::util::memory::ByteBufferPtr;
+use crate::util::{DataPageBuilder, DataPageBuilderImpl};
 
 /// Random generator of data type `T` values and sequences.
 pub trait RandGen<T: DataType> {
@@ -106,15 +112,6 @@
     result
 }
 
-pub fn random_bools(n: usize) -> Vec<bool> {
-    let mut result = vec![];
-    let mut rng = thread_rng();
-    for _ in 0..n {
-        result.push(rng.gen::<bool>());
-    }
-    result
-}
-
 pub fn random_numbers<T>(n: usize) -> Vec<T>
 where
     Standard: Distribution<T>,
@@ -132,3 +129,89 @@
         result.push(rng.gen_range(low..high));
     }
 }
+
+#[allow(clippy::too_many_arguments)]
+pub fn make_pages<T: DataType>(
+    desc: ColumnDescPtr,
+    encoding: Encoding,
+    num_pages: usize,
+    levels_per_page: usize,
+    min: T::T,
+    max: T::T,
+    def_levels: &mut Vec<i16>,
+    rep_levels: &mut Vec<i16>,
+    values: &mut Vec<T::T>,
+    pages: &mut VecDeque<Page>,
+    use_v2: bool,
+) where
+    T::T: PartialOrd + SampleUniform + Copy,
+{
+    let mut num_values = 0;
+    let max_def_level = desc.max_def_level();
+    let max_rep_level = desc.max_rep_level();
+
+    let mut dict_encoder = DictEncoder::<T>::new(desc.clone());
+
+    for i in 0..num_pages {
+        let mut num_values_cur_page = 0;
+        let level_range = i * levels_per_page..(i + 1) * levels_per_page;
+
+        if max_def_level > 0 {
+            random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels);
+            for dl in &def_levels[level_range.clone()] {
+                if *dl == max_def_level {
+                    num_values_cur_page += 1;
+                }
+            }
+        } else {
+            num_values_cur_page = levels_per_page;
+        }
+        if max_rep_level > 0 {
+            random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels);
+        }
+        random_numbers_range(num_values_cur_page, min, max, values);
+
+        // Generate the current page
+
+        let mut pb =
+            DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2);
+        if max_rep_level > 0 {
+            pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]);
+        }
+        if max_def_level > 0 {
+            pb.add_def_levels(max_def_level, &def_levels[level_range]);
+        }
+
+        let value_range = num_values..num_values + num_values_cur_page;
+        match encoding {
+            Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => {
+                let _ = dict_encoder.put(&values[value_range.clone()]);
+                let indices = dict_encoder
+                    .write_indices()
+                    .expect("write_indices() should be OK");
+                pb.add_indices(indices);
+            }
+            Encoding::PLAIN => {
+                pb.add_values::<T>(encoding, &values[value_range]);
+            }
+            enc => panic!("Unexpected encoding {}", enc),
+        }
+
+        let data_page = pb.consume();
+        pages.push_back(data_page);
+        num_values += num_values_cur_page;
+    }
+
+    if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY {
+        let dict = dict_encoder
+            .write_dict()
+            .expect("write_dict() should be OK");
+        let dict_page = Page::DictionaryPage {
+            buf: dict,
+            num_values: dict_encoder.num_entries() as u32,
+            encoding: Encoding::RLE_DICTIONARY,
+            is_sorted: false,
+        };
+        pages.push_front(dict_page);
+    }
+}
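
For context, the most visible API change in this diff is that `get_encoder` (and `PlainEncoder::new`) no longer take a `ColumnDescPtr`. Below is a minimal, illustrative sketch of the new call shape, written as crate-internal code; it assumes the `encodings` module is visible to the caller (it is crate-private unless exposed through the crate's experimental surface), and the values are made up:

    use crate::basic::Encoding;
    use crate::data_type::Int32Type;
    use crate::encodings::encoding::get_encoder;
    use crate::errors::Result;

    fn encode_plain_example() -> Result<Vec<u8>> {
        // After this change the encoder is constructed from the encoding alone;
        // previously a column descriptor had to be built and passed in as well.
        let mut encoder = get_encoder::<Int32Type>(Encoding::PLAIN)?;
        encoder.put(&[1, 2, 3])?;
        // flush_buffer() yields a ByteBufferPtr; copy the bytes out for the caller.
        Ok(encoder.flush_buffer()?.data().to_vec())
    }

This is why the `DictEncoder`, `DataPageBuilderImpl`, and the encoding tests in this diff all drop their descriptor arguments: only dictionary encoding still needs the descriptor, and it receives it through `DictEncoder::new` directly.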