depr(python): Deprecate LazyFrame.fetch #17278

Merged
merged 2 commits on Jun 30, 2024
@@ -10,7 +10,6 @@ Miscellaneous
LazyFrame.collect
LazyFrame.collect_async
LazyFrame.collect_schema
LazyFrame.fetch
LazyFrame.lazy
LazyFrame.map_batches
LazyFrame.pipe
10 changes: 6 additions & 4 deletions py-polars/polars/dataframe/frame.py
@@ -2378,8 +2378,10 @@ def to_init_repr(self, n: int = 1000) -> str:
def serialize(
self, file: None = ..., *, format: Literal["binary"] = ...
) -> bytes: ...

@overload
def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ...

@overload
def serialize(
self, file: IOBase | str | Path, *, format: SerializationFormat = ...
@@ -8451,8 +8453,6 @@ def lazy(self) -> LazyFrame:
Operations on a `LazyFrame` are not executed until this is requested by either
calling:

* :meth:`.fetch() <polars.LazyFrame.fetch>`
(run on a small number of rows)
* :meth:`.collect() <polars.LazyFrame.collect>`
(run on all data)
* :meth:`.describe_plan() <polars.LazyFrame.describe_plan>`
@@ -8461,9 +8461,11 @@ def lazy(self) -> LazyFrame:
(print optimized query plan)
* :meth:`.show_graph() <polars.LazyFrame.show_graph>`
(show (un)optimized query plan as graphviz graph)
* :meth:`.collect_schema() <polars.LazyFrame.collect_schema>`
(return the final frame schema)

Lazy operations are advised because they allow for query optimization and more
parallelization.
Lazy operations are recommended because they allow for query optimization and
additional parallelism.

Returns
-------
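The docstring change above drops `fetch` from the list of materializing methods and adds `collect_schema`. A minimal sketch of the workflow it describes (not code from this PR; the data and column names are made up):

```python
import polars as pl

# Nothing executes until collect() is called; collect_schema() resolves
# only the output schema without running the query.
lf = (
    pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    .lazy()
    .filter(pl.col("a") > 1)
    .with_columns((pl.col("a") + pl.col("b")).alias("total"))
)

print(lf.collect_schema())  # schema of the final frame, no data materialized
print(lf.collect())         # runs the optimized plan on all rows
```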
2 changes: 1 addition & 1 deletion py-polars/polars/io/csv/functions.py
@@ -1073,7 +1073,7 @@ def scan_csv(
... .filter(
... pl.col("a") > 10
... ) # the filter is pushed down the scan, so less data is read into memory
... .fetch(100) # pushed a limit of 100 rows to the scan level
... .head(100) # constrain number of returned results to 100
... ) # doctest: +SKIP

We can use `with_column_names` to modify the header before scanning:
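For context, the full pattern the revised `scan_csv` example points at looks roughly like this (a sketch; `data.csv` and column `a` are placeholders, not files from this repository):

```python
import polars as pl

result = (
    pl.scan_csv("data.csv")
    .filter(pl.col("a") > 10)  # predicate pushdown: the filter runs at the scan
    .head(100)                 # constrain the number of returned rows to 100
    .collect()
)
```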
57 changes: 56 additions & 1 deletion py-polars/polars/lazyframe/frame.py
@@ -651,8 +651,10 @@ def _repr_html_(self) -> str:
def serialize(
self, file: None = ..., *, format: Literal["binary"] = ...
) -> bytes: ...

@overload
def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ...

@overload
def serialize(
self, file: IOBase | str | Path, *, format: SerializationFormat = ...
@@ -2581,6 +2583,10 @@ def _set_sink_optimizations(
new_streaming=False,
)

@deprecate_function(
"This will shortly become a private function; use is strongly discouraged",
version="1.0",
)
def fetch(
self,
n_rows: int = 500,
@@ -2599,6 +2605,55 @@ def fetch(
"""
Collect a small number of rows for debugging purposes.

Notes
-----
This is similar to a :func:`collect` operation, but it overwrites the number of
rows read by *every* scan operation. Be aware that `fetch` does not guarantee
the final number of rows in the DataFrame. Filters, join operations and fewer
rows being available in the scanned data will all influence the final number
of rows (joins are especially susceptible to this, and may return no data
at all if `n_rows` is too small as the join keys may not be present).

Warnings
--------
This is strictly a utility function that can help to debug queries using a
smaller number of rows, and should *not* be used in production code.
"""
return self._fetch(
n_rows=n_rows,
type_coercion=type_coercion,
predicate_pushdown=predicate_pushdown,
projection_pushdown=projection_pushdown,
simplify_expression=simplify_expression,
no_optimization=no_optimization,
slice_pushdown=slice_pushdown,
comm_subplan_elim=comm_subplan_elim,
comm_subexpr_elim=comm_subexpr_elim,
cluster_with_columns=cluster_with_columns,
streaming=streaming,
)

def _fetch(
self,
n_rows: int = 500,
*,
type_coercion: bool = True,
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
comm_subplan_elim: bool = True,
comm_subexpr_elim: bool = True,
cluster_with_columns: bool = True,
streaming: bool = False,
) -> DataFrame:
"""
Collect a small number of rows for debugging purposes.

Do not confuse with `collect`; this function will frequently return
incorrect data (see the warning for additional details).

Parameters
----------
n_rows
@@ -2655,7 +2710,7 @@ def fetch(
... "c": [6, 5, 4, 3, 2, 1],
... }
... )
>>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2)
>>> lf.group_by("a", maintain_order=True).agg(pl.all().sum())._fetch(2)
shape: (2, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
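The Notes and Warnings added above describe why `fetch` was never a safe substitute for `collect`: it caps the rows read by every scan, so downstream joins can silently lose matches. A small illustration of that caveat (a sketch using assumed data, not code from this PR):

```python
import polars as pl

left = pl.LazyFrame({"key": [1, 2, 3, 4, 5], "x": [10, 20, 30, 40, 50]})
right = pl.LazyFrame({"key": [5, 4, 3, 2, 1], "y": [5, 4, 3, 2, 1]})
joined = left.join(right, on="key")

# _fetch(1) cuts *each* input to one row (key=1 on the left, key=5 on the
# right here), so the inner join can come back empty.
print(joined._fetch(1))

# head(1).collect() limits only the final result: one joined row is returned.
print(joined.head(1).collect())
```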
4 changes: 2 additions & 2 deletions py-polars/tests/unit/io/test_parquet.py
@@ -437,8 +437,8 @@ def test_fetch_union(tmp_path: Path) -> None:
df1.write_parquet(file_path_1)
df2.write_parquet(file_path_2)

result_one = pl.scan_parquet(file_path_1).fetch(1)
result_glob = pl.scan_parquet(file_path_glob).fetch(1)
result_one = pl.scan_parquet(file_path_1)._fetch(1)
result_glob = pl.scan_parquet(file_path_glob)._fetch(1)

expected = pl.DataFrame({"a": [0], "b": [1]})
assert_frame_equal(result_one, expected)
2 changes: 1 addition & 1 deletion py-polars/tests/unit/lazyframe/test_lazyframe.py
@@ -355,7 +355,7 @@ def test_inspect(capsys: CaptureFixture[str]) -> None:


def test_fetch(fruits_cars: pl.DataFrame) -> None:
res = fruits_cars.lazy().select("*").fetch(2)
res = fruits_cars.lazy().select("*")._fetch(2)
assert_frame_equal(res, res[:2])


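The existing tests above were switched to the private `_fetch`. A companion test for the deprecation itself might look like the following (hypothetical, not part of this PR; it reuses the `fruits_cars` fixture from the test module and assumes the decorator emits a `DeprecationWarning`):

```python
import pytest
import polars as pl
from polars.testing import assert_frame_equal


def test_fetch_deprecation_warning(fruits_cars: pl.DataFrame) -> None:
    lf = fruits_cars.lazy().select("*")

    # The public fetch() should now warn, while returning the same
    # result as the private _fetch() it delegates to.
    with pytest.deprecated_call():
        res = lf.fetch(2)

    assert_frame_equal(res, lf._fetch(2))
```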