depr(python): Deprecate LazyFrame.fetch #17278

Merged
merged 2 commits on Jun 30, 2024
@@ -10,7 +10,6 @@ Miscellaneous
LazyFrame.collect
LazyFrame.collect_async
LazyFrame.collect_schema
LazyFrame.fetch
LazyFrame.lazy
LazyFrame.map_batches
LazyFrame.pipe
10 changes: 6 additions & 4 deletions py-polars/polars/dataframe/frame.py
@@ -2378,8 +2378,10 @@ def to_init_repr(self, n: int = 1000) -> str:
def serialize(
self, file: None = ..., *, format: Literal["binary"] = ...
) -> bytes: ...

@overload
def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ...

@overload
def serialize(
self, file: IOBase | str | Path, *, format: SerializationFormat = ...
@@ -8451,8 +8453,6 @@ def lazy(self) -> LazyFrame:
Operations on a `LazyFrame` are not executed until this is requested by either
calling:

* :meth:`.fetch() <polars.LazyFrame.fetch>`
(run on a small number of rows)
* :meth:`.collect() <polars.LazyFrame.collect>`
(run on all data)
* :meth:`.describe_plan() <polars.LazyFrame.describe_plan>`
@@ -8461,9 +8461,11 @@ def lazy(self) -> LazyFrame:
(print optimized query plan)
* :meth:`.show_graph() <polars.LazyFrame.show_graph>`
(show (un)optimized query plan as graphviz graph)
* :meth:`.collect_schema() <polars.LazyFrame.collect_schema>`
(return the final frame schema)

Lazy operations are advised because they allow for query optimization and more
parallelization.
Lazy operations are recommended because they allow for query optimization and
additional parallelism.

Returns
-------
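The docstring change above drops `fetch` from the list of materializing methods and adds `collect_schema`. A minimal sketch of the workflow it describes (not code from this PR; the data and column names are made up):

```python
import polars as pl

# Nothing executes until collect() is called; collect_schema() resolves
# only the output schema without running the query.
lf = (
    pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    .lazy()
    .filter(pl.col("a") > 1)
    .with_columns((pl.col("a") + pl.col("b")).alias("total"))
)

print(lf.collect_schema())  # schema of the final frame, no data materialized
print(lf.collect())         # runs the optimized plan on all rows
```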
2 changes: 1 addition & 1 deletion py-polars/polars/io/csv/functions.py
@@ -1073,7 +1073,7 @@ def scan_csv(
... .filter(
... pl.col("a") > 10
... ) # the filter is pushed down the scan, so less data is read into memory
... .fetch(100) # pushed a limit of 100 rows to the scan level
... .head(100) # constrain number of returned results to 100
... ) # doctest: +SKIP

We can use `with_column_names` to modify the header before scanning:
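For context, the full pattern the revised `scan_csv` example points at looks roughly like this (a sketch; `data.csv` and column `a` are placeholders, not files from this repository):

```python
import polars as pl

result = (
    pl.scan_csv("data.csv")
    .filter(pl.col("a") > 10)  # predicate pushdown: the filter runs at the scan
    .head(100)                 # constrain the number of returned rows to 100
    .collect()
)
```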
57 changes: 56 additions & 1 deletion py-polars/polars/lazyframe/frame.py
@@ -651,8 +651,10 @@ def _repr_html_(self) -> str:
def serialize(
self, file: None = ..., *, format: Literal["binary"] = ...
) -> bytes: ...

@overload
def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ...

@overload
def serialize(
self, file: IOBase | str | Path, *, format: SerializationFormat = ...
@@ -2581,6 +2583,10 @@ def _set_sink_optimizations(
new_streaming=False,
)

@deprecate_function(
"This will shortly become a private function; use is strongly discouraged",
version="1.0",
)
def fetch(
self,
n_rows: int = 500,
@@ -2599,6 +2605,55 @@ def fetch(
"""
Collect a small number of rows for debugging purposes.

Notes
-----
This is similar to a :func:`collect` operation, but it overwrites the number of
rows read by *every* scan operation. Be aware that `fetch` does not guarantee
the final number of rows in the DataFrame. Filters, join operations and fewer
rows being available in the scanned data will all influence the final number
of rows (joins are especially susceptible to this, and may return no data
at all if `n_rows` is too small as the join keys may not be present).

Warnings
--------
This is strictly a utility function that can help to debug queries using a
smaller number of rows, and should *not* be used in production code.
"""
return self._fetch(
n_rows=n_rows,
type_coercion=type_coercion,
predicate_pushdown=predicate_pushdown,
projection_pushdown=projection_pushdown,
simplify_expression=simplify_expression,
no_optimization=no_optimization,
slice_pushdown=slice_pushdown,
comm_subplan_elim=comm_subplan_elim,
comm_subexpr_elim=comm_subexpr_elim,
cluster_with_columns=cluster_with_columns,
streaming=streaming,
)

def _fetch(
self,
n_rows: int = 500,
*,
type_coercion: bool = True,
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
comm_subplan_elim: bool = True,
comm_subexpr_elim: bool = True,
cluster_with_columns: bool = True,
streaming: bool = False,
) -> DataFrame:
"""
Collect a small number of rows for debugging purposes.

Do not confuse with `collect`; this function will frequently return
incorrect data (see the warning for additional details).

Parameters
----------
n_rows
@@ -2655,7 +2710,7 @@ def fetch(
... "c": [6, 5, 4, 3, 2, 1],
... }
... )
>>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2)
>>> lf.group_by("a", maintain_order=True).agg(pl.all().sum())._fetch(2)
shape: (2, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
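The Notes and Warnings added above describe why `fetch` was never a safe substitute for `collect`: it caps the rows read by every scan, so downstream joins can silently lose matches. A small illustration of that caveat (a sketch using assumed data, not code from this PR):

```python
import polars as pl

left = pl.LazyFrame({"key": [1, 2, 3, 4, 5], "x": [10, 20, 30, 40, 50]})
right = pl.LazyFrame({"key": [5, 4, 3, 2, 1], "y": [5, 4, 3, 2, 1]})
joined = left.join(right, on="key")

# _fetch(1) cuts *each* input to one row (key=1 on the left, key=5 on the
# right here), so the inner join can come back empty.
print(joined._fetch(1))

# head(1).collect() limits only the final result: one joined row is returned.
print(joined.head(1).collect())
```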
4 changes: 2 additions & 2 deletions py-polars/tests/unit/io/test_parquet.py
@@ -437,8 +437,8 @@ def test_fetch_union(tmp_path: Path) -> None:
df1.write_parquet(file_path_1)
df2.write_parquet(file_path_2)

result_one = pl.scan_parquet(file_path_1).fetch(1)
result_glob = pl.scan_parquet(file_path_glob).fetch(1)
result_one = pl.scan_parquet(file_path_1)._fetch(1)
result_glob = pl.scan_parquet(file_path_glob)._fetch(1)

expected = pl.DataFrame({"a": [0], "b": [1]})
assert_frame_equal(result_one, expected)
2 changes: 1 addition & 1 deletion py-polars/tests/unit/lazyframe/test_lazyframe.py
@@ -355,7 +355,7 @@ def test_inspect(capsys: CaptureFixture[str]) -> None:


def test_fetch(fruits_cars: pl.DataFrame) -> None:
res = fruits_cars.lazy().select("*").fetch(2)
res = fruits_cars.lazy().select("*")._fetch(2)
assert_frame_equal(res, res[:2])


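The existing tests above were switched to the private `_fetch`. A companion test for the deprecation itself might look like the following (hypothetical, not part of this PR; it reuses the `fruits_cars` fixture from the test module and assumes the decorator emits a `DeprecationWarning`):

```python
import pytest
import polars as pl
from polars.testing import assert_frame_equal


def test_fetch_deprecation_warning(fruits_cars: pl.DataFrame) -> None:
    lf = fruits_cars.lazy().select("*")

    # The public fetch() should now warn, while returning the same
    # result as the private _fetch() it delegates to.
    with pytest.deprecated_call():
        res = lf.fetch(2)

    assert_frame_equal(res, lf._fetch(2))
```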