Skip to content

Commit

Permalink
fix: Handle Parquet projection pushdown with only row index (#18520)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Sep 2, 2024
1 parent 0b65d88 commit f5bb65c
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 0 deletions.
15 changes: 15 additions & 0 deletions crates/polars-io/src/parquet/read/read_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::ops::{Deref, Range};
use arrow::array::BooleanArray;
use arrow::bitmap::MutableBitmap;
use arrow::datatypes::ArrowSchemaRef;
use polars_core::chunked_array::builder::NullChunkedBuilder;
use polars_core::prelude::*;
use polars_core::utils::{accumulate_dataframes_vertical, split_df};
use polars_core::POOL;
Expand Down Expand Up @@ -151,6 +152,20 @@ fn rg_to_dfs(
use_statistics: bool,
hive_partition_columns: Option<&[Series]>,
) -> PolarsResult<Vec<DataFrame>> {
// If we are only interested in the row_index, we take a little special path here.
if projection.is_empty() {
if let Some(row_index) = row_index {
let placeholder =
NullChunkedBuilder::new(PlSmallStr::from_static("__PL_TMP"), slice.1).finish();
return Ok(vec![DataFrame::new(vec![placeholder.into_series()])?
.with_row_index(
row_index.name.clone(),
Some(row_index.offset + IdxSize::try_from(slice.0).unwrap()),
)?
.select(std::iter::once(row_index.name))?]);
}
}

use ParallelStrategy as S;

if parallel == S::Prefiltered {
Expand Down
35 changes: 35 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1868,3 +1868,38 @@ def test_empty_parquet() -> None:

empty_from_pl = pl.read_parquet(f_pl)
assert empty_from_pl.shape == (0, 0)


@pytest.mark.parametrize(
"strategy",
["columns", "row_groups", "prefiltered"],
)
@pytest.mark.write_disk
def test_row_index_projection_pushdown_18463(
tmp_path: Path, strategy: pl.ParallelStrategy
) -> None:
tmp_path.mkdir(exist_ok=True)
f = tmp_path / "test.parquet"

pl.DataFrame({"A": [1, 4], "B": [2, 5]}).write_parquet(f)

df = pl.scan_parquet(f, parallel=strategy).with_row_index()

assert_frame_equal(df.select("index").collect(), df.collect().select("index"))

df = pl.scan_parquet(f, parallel=strategy).with_row_index("other_idx_name")

assert_frame_equal(
df.select("other_idx_name").collect(), df.collect().select("other_idx_name")
)

df = pl.scan_parquet(f, parallel=strategy).with_row_index(offset=42)

assert_frame_equal(df.select("index").collect(), df.collect().select("index"))

df = pl.scan_parquet(f, parallel=strategy).with_row_index()

assert_frame_equal(
df.select("index").slice(1, 1).collect(),
df.collect().select("index").slice(1, 1),
)

0 comments on commit f5bb65c

Please sign in to comment.