-
Notifications
You must be signed in to change notification settings - Fork 750
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
parquet bloom filter part II: read sbbf bitset from row group reader,…
… update API, and add cli demo (#3102) * add feature flag * add api * fix reading with chunk reader * refactor * add a binary to demo * add bin * remove unused * fix clippy * adjust byte size * update read method * parquet-show-bloom-filter with bloom feature required * remove extern crate * get rid of loop read * refactor to test * rework api * remove unused trait * update help
- Loading branch information
Showing
9 changed files
with
235 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! Binary file to read bloom filter data from a Parquet file. | ||
//! | ||
//! # Install | ||
//! | ||
//! `parquet-show-bloom-filter` can be installed using `cargo`: | ||
//! ``` | ||
//! cargo install parquet --features=cli | ||
//! ``` | ||
//! After this `parquet-show-bloom-filter` should be available: | ||
//! ``` | ||
//! parquet-show-bloom-filter --file-name XYZ.parquet --column id --values a | ||
//! ``` | ||
//! | ||
//! The binary can also be built from the source code and run as follows: | ||
//! ``` | ||
//! cargo run --features=cli --bin parquet-show-bloom-filter -- --file-name XYZ.parquet --column id --values a | ||
//! ``` | ||
|
||
use clap::Parser; | ||
use parquet::file::reader::{FileReader, SerializedFileReader}; | ||
use std::{fs::File, path::Path}; | ||
|
||
#[derive(Debug, Parser)] | ||
#[clap(author, version, about("Binary file to read bloom filter data from a Parquet file"), long_about = None)] | ||
struct Args { | ||
#[clap(short, long, help("Path to the parquet file"))] | ||
file_name: String, | ||
#[clap( | ||
short, | ||
long, | ||
help("Check the bloom filter indexes for the given column") | ||
)] | ||
column: String, | ||
#[clap( | ||
short, | ||
long, | ||
help("Check if the given values match bloom filter, the values will be evaluated as strings"), | ||
required = true | ||
)] | ||
values: Vec<String>, | ||
} | ||
|
||
fn main() { | ||
let args = Args::parse(); | ||
let file_name = args.file_name; | ||
let path = Path::new(&file_name); | ||
let file = File::open(path).expect("Unable to open file"); | ||
|
||
let file_reader = | ||
SerializedFileReader::new(file).expect("Unable to open file as Parquet"); | ||
let metadata = file_reader.metadata(); | ||
for (ri, row_group) in metadata.row_groups().iter().enumerate() { | ||
println!("Row group #{}", ri); | ||
println!("{}", "=".repeat(80)); | ||
if let Some((column_index, _)) = row_group | ||
.columns() | ||
.iter() | ||
.enumerate() | ||
.find(|(_, column)| column.column_path().string() == args.column) | ||
{ | ||
let row_group_reader = file_reader | ||
.get_row_group(ri) | ||
.expect("Unable to read row group"); | ||
if let Some(sbbf) = row_group_reader | ||
.get_column_bloom_filter(column_index) | ||
.expect("Failed to parse bloom filter") | ||
{ | ||
args.values.iter().for_each(|value| { | ||
println!( | ||
"Value {} is {} in bloom filter", | ||
value, | ||
if sbbf.check(value.as_str()) { | ||
"present" | ||
} else { | ||
"absent" | ||
} | ||
) | ||
}); | ||
} | ||
} else { | ||
println!( | ||
"No column named {} found, candidate columns are: {}", | ||
args.column, | ||
row_group | ||
.columns() | ||
.iter() | ||
.map(|c| c.column_path().string()) | ||
.collect::<Vec<_>>() | ||
.join(", ") | ||
); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.