From 6e88f1dac92c312641f08a4874589ac2be15a7d9 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 31 May 2024 13:14:37 +0200 Subject: [PATCH] fix: ensure df in empty parquet (#16621) --- crates/polars-io/src/parquet/read/read_impl.rs | 5 ++++- py-polars/tests/unit/streaming/test_streaming_io.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index 0226e0679ba2..d461d54c95c8 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -779,10 +779,13 @@ impl BatchedParquetReader { self.chunks_fifo.push_back(df) } } + } else { + skipped_all_rgs = !self.has_returned; }; if self.chunks_fifo.is_empty() { if skipped_all_rgs { + self.has_returned = true; Ok(Some(vec![materialize_empty_df( Some(self.projection.as_ref()), &self.schema, @@ -803,7 +806,7 @@ impl BatchedParquetReader { } } - self.has_returned |= true; + self.has_returned = true; Ok(Some(chunks)) } } diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index 982ba225e9d9..83fa75a4b4a8 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -268,3 +268,13 @@ def test_parquet_eq_statistics(monkeypatch: Any, capfd: Any, tmp_path: Path) -> "parquet file can be skipped, the statistics were sufficient" " to apply the predicate." in captured ) + + +@pytest.mark.write_disk() +def test_streaming_empty_parquet_16523(tmp_path: Path) -> None: + file_path = tmp_path / "foo.parquet" + df = pl.DataFrame({"a": []}, schema={"a": pl.Int32}) + df.write_parquet(file_path) + q = pl.scan_parquet(file_path) + q2 = pl.LazyFrame({"a": [1]}, schema={"a": pl.Int32}) + assert q.join(q2, on="a").collect(streaming=True).shape == (0, 1)