Skip to content

Commit

Permalink
Make Parquet reader filter APIs public (#1792) (#2467)
Browse files Browse the repository at this point in the history
* Make filter APIs public (#1792)

* Update parquet/src/arrow/arrow_reader/mod.rs

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>
  • Loading branch information
tustvold and viirya authored Aug 17, 2022
1 parent 0013170 commit 6d0ea90
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 19 deletions.
28 changes: 11 additions & 17 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,11 @@ use crate::file::reader::{ChunkReader, FileReader, SerializedFileReader};
use crate::file::serialized_reader::ReadOptionsBuilder;
use crate::schema::types::SchemaDescriptor;

#[allow(unused)]
mod filter;
#[allow(unused)]
mod selection;

// TODO: Make these public once stable (#1792)
#[allow(unused_imports)]
pub(crate) use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter};
#[allow(unused_imports)]
pub(crate) use selection::{RowSelection, RowSelector};
pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter};
pub use selection::{RowSelection, RowSelector};

/// A generic builder for constructing sync or async arrow parquet readers. This is not intended
/// to be used directly, instead you should use the specialization for the type of reader
Expand Down Expand Up @@ -140,15 +135,17 @@ impl<T> ArrowReaderBuilder<T> {
}
}

/// Provide a [`RowSelection] to filter out rows, and avoid fetching their
/// data into memory
/// Provide a [`RowSelection`] to filter out rows, and avoid fetching their
/// data into memory.
///
/// Row group filtering is applied prior to this, and rows from skipped
/// Row group filtering is applied prior to this, and therefore rows from skipped
/// row groups should not be included in the [`RowSelection`]
///
/// TODO: Make public once stable (#1792)
#[allow(unused)]
pub(crate) fn with_row_selection(self, selection: RowSelection) -> Self {
/// An example use case of this would be applying a selection determined by
/// evaluating predicates against the [`Index`].
///
/// [`Index`]: crate::file::page_index::index::Index
pub fn with_row_selection(self, selection: RowSelection) -> Self {
Self {
selection: Some(selection),
..self
Expand All @@ -158,10 +155,7 @@ impl<T> ArrowReaderBuilder<T> {
/// Provide a [`RowFilter`] to skip decoding rows
///
/// Row filters are applied after row group selection and row selection
///
/// TODO: Make public once stable (#1792)
#[allow(unused)]
pub(crate) fn with_row_filter(self, filter: RowFilter) -> Self {
pub fn with_row_filter(self, filter: RowFilter) -> Self {
Self {
filter: Some(filter),
..self
Expand Down
4 changes: 2 additions & 2 deletions parquet/src/arrow/arrow_reader/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -451,11 +451,11 @@ mod tests {
let mut rand = thread_rng();
for _ in 0..100 {
let a_len = rand.gen_range(10..100);
let a_bools: Vec<_> = (0..a_len).map(|x| rand.gen_bool(0.2)).collect();
let a_bools: Vec<_> = (0..a_len).map(|_| rand.gen_bool(0.2)).collect();
let a = RowSelection::from_filters(&[BooleanArray::from(a_bools.clone())]);

let b_len: usize = a_bools.iter().map(|x| *x as usize).sum();
let b_bools: Vec<_> = (0..b_len).map(|x| rand.gen_bool(0.8)).collect();
let b_bools: Vec<_> = (0..b_len).map(|_| rand.gen_bool(0.8)).collect();
let b = RowSelection::from_filters(&[BooleanArray::from(b_bools.clone())]);

let mut expected_bools = vec![false; a_len];
Expand Down

0 comments on commit 6d0ea90

Please sign in to comment.