diff --git a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst index 0cdf5096e445..2a0ab647766d 100644 --- a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst +++ b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst @@ -10,7 +10,6 @@ Miscellaneous LazyFrame.collect LazyFrame.collect_async LazyFrame.collect_schema - LazyFrame.fetch LazyFrame.lazy LazyFrame.map_batches LazyFrame.pipe diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index a660e26fb18c..367839030f9b 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -2378,8 +2378,10 @@ def to_init_repr(self, n: int = 1000) -> str: def serialize( self, file: None = ..., *, format: Literal["binary"] = ... ) -> bytes: ... + @overload def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ... + @overload def serialize( self, file: IOBase | str | Path, *, format: SerializationFormat = ... @@ -8451,8 +8453,6 @@ def lazy(self) -> LazyFrame: Operations on a `LazyFrame` are not executed until this is requested by either calling: - * :meth:`.fetch() ` - (run on a small number of rows) * :meth:`.collect() ` (run on all data) * :meth:`.describe_plan() ` @@ -8461,9 +8461,11 @@ def lazy(self) -> LazyFrame: (print optimized query plan) * :meth:`.show_graph() ` (show (un)optimized query plan as graphviz graph) + * :meth:`.collect_schema() ` + (return the final frame schema) - Lazy operations are advised because they allow for query optimization and more - parallelization. + Lazy operations are recommended because they allow for query optimization and + additional parallelism. Returns ------- diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index a3518a2d68e7..bfa3d3719211 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1073,7 +1073,7 @@ def scan_csv( ... 
.filter( ... pl.col("a") > 10 ... ) # the filter is pushed down the scan, so less data is read into memory - ... .fetch(100) # pushed a limit of 100 rows to the scan level + ... .head(100) # constrain number of returned results to 100 ... ) # doctest: +SKIP We can use `with_column_names` to modify the header before scanning: diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 171ce9afd169..bdcf20c9ad4b 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -651,8 +651,10 @@ def _repr_html_(self) -> str: def serialize( self, file: None = ..., *, format: Literal["binary"] = ... ) -> bytes: ... + @overload def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ... + @overload def serialize( self, file: IOBase | str | Path, *, format: SerializationFormat = ... @@ -2581,6 +2583,11 @@ def _set_sink_optimizations( new_streaming=False, ) + @deprecate_function( + "`LazyFrame.fetch` is deprecated; use `LazyFrame.collect` " + "instead, in conjunction with a call to `head`.", + version="1.0", + ) def fetch( self, n_rows: int = 500, @@ -2599,6 +2606,58 @@ def fetch( """ Collect a small number of rows for debugging purposes. + .. deprecated:: 1.0 + Use :meth:`collect` instead, in conjunction with a call to :meth:`head`. + + Notes + ----- + This is similar to a :meth:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. 
+ """ + return self._fetch( + n_rows=n_rows, + type_coercion=type_coercion, + predicate_pushdown=predicate_pushdown, + projection_pushdown=projection_pushdown, + simplify_expression=simplify_expression, + no_optimization=no_optimization, + slice_pushdown=slice_pushdown, + comm_subplan_elim=comm_subplan_elim, + comm_subexpr_elim=comm_subexpr_elim, + cluster_with_columns=cluster_with_columns, + streaming=streaming, + ) + + def _fetch( + self, + n_rows: int = 500, + *, + type_coercion: bool = True, + predicate_pushdown: bool = True, + projection_pushdown: bool = True, + simplify_expression: bool = True, + no_optimization: bool = False, + slice_pushdown: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, + cluster_with_columns: bool = True, + streaming: bool = False, + ) -> DataFrame: + """ + Collect a small number of rows for debugging purposes. + + Do not confuse with `collect`; this function will frequently return + incorrect data (see the warning for additional details). + Parameters ---------- n_rows @@ -2655,7 +2714,7 @@ def fetch( ... "c": [6, 5, 4, 3, 2, 1], ... } ... 
) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum())._fetch(2) shape: (2, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 173cf84d56fd..cda90eb65077 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -437,8 +437,8 @@ def test_fetch_union(tmp_path: Path) -> None: df1.write_parquet(file_path_1) df2.write_parquet(file_path_2) - result_one = pl.scan_parquet(file_path_1).fetch(1) - result_glob = pl.scan_parquet(file_path_glob).fetch(1) + result_one = pl.scan_parquet(file_path_1)._fetch(1) + result_glob = pl.scan_parquet(file_path_glob)._fetch(1) expected = pl.DataFrame({"a": [0], "b": [1]}) assert_frame_equal(result_one, expected) diff --git a/py-polars/tests/unit/lazyframe/test_lazyframe.py b/py-polars/tests/unit/lazyframe/test_lazyframe.py index 78cc18898068..730a51d0b493 100644 --- a/py-polars/tests/unit/lazyframe/test_lazyframe.py +++ b/py-polars/tests/unit/lazyframe/test_lazyframe.py @@ -355,7 +355,7 @@ def test_inspect(capsys: CaptureFixture[str]) -> None: def test_fetch(fruits_cars: pl.DataFrame) -> None: - res = fruits_cars.lazy().select("*").fetch(2) + res = fruits_cars.lazy().select("*")._fetch(2) assert_frame_equal(res, res[:2])