Skip to content

Commit

Permalink
depr(python): Change parameter chunked to allow_chunks in paramet…
Browse files Browse the repository at this point in the history
…ric testing strategies (#16264)
  • Loading branch information
stinodego authored May 16, 2024
1 parent 98a2d9b commit 84ac01b
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 79 deletions.
49 changes: 29 additions & 20 deletions py-polars/polars/testing/parametric/strategies/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ def series( # noqa: D417
max_size: int = _ROW_LIMIT,
strategy: SearchStrategy[Any] | None = None,
allow_null: bool = True,
allow_chunks: bool = True,
unique: bool = False,
chunked: bool | None = None,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand Down Expand Up @@ -69,11 +69,10 @@ def series( # noqa: D417
supports overriding the default strategy for the given dtype.
allow_null : bool
Allow nulls as possible values and allow the `Null` data type by default.
allow_chunks : bool
Allow the Series to contain multiple chunks.
unique : bool, optional
indicate whether Series values should all be distinct.
chunked : bool, optional
ensure that Series with more than one element have `n_chunks` > 1.
if omitted, chunking is applied at random.
allowed_dtypes : {list,set}, optional
when automatically generating Series data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -137,6 +136,12 @@ def series( # noqa: D417
version="0.20.26",
)
kwargs["allow_infinity"] = allow_inf
if (chunked := kwargs.pop("chunked", None)) is not None:
issue_deprecation_warning(
"`chunked` is deprecated. Use `allow_chunks` instead.",
version="0.20.26",
)
allow_chunks = chunked

if isinstance(allowed_dtypes, (DataType, DataTypeClass)):
allowed_dtypes = [allowed_dtypes]
Expand Down Expand Up @@ -196,12 +201,9 @@ def series( # noqa: D417
s = Series(name=name, values=values, dtype=dtype)

# Apply chunking
if size > 1:
if chunked is None:
chunked = draw(st.booleans())
if chunked:
split_at = size // 2
s = s[:split_at].append(s[split_at:])
if allow_chunks and size > 1 and draw(st.booleans()):
split_at = size // 2
s = s[:split_at].append(s[split_at:])

return s

Expand All @@ -216,9 +218,9 @@ def dataframes(
size: int | None = None,
min_size: int = 0,
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = True,
allow_chunks: bool = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -235,9 +237,9 @@ def dataframes(
size: int | None = None,
min_size: int = 0,
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = True,
allow_chunks: bool = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -256,9 +258,9 @@ def dataframes( # noqa: D417
size: int | None = None,
min_size: int = 0,
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = True,
allow_chunks: bool = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand Down Expand Up @@ -287,16 +289,15 @@ def dataframes( # noqa: D417
max_size : int, optional
if not passing an exact size, set the maximum number of rows in the
DataFrame.
chunked : bool, optional
ensure that DataFrames with more than one row have `n_chunks` > 1. if
omitted, chunking will be randomised at the level of individual Series.
include_cols : [column], optional
a list of `column` objects to include in the generated DataFrame. note that
explicitly provided columns are appended onto the list of existing columns
(if any present).
allow_null : bool or Mapping[str, bool]
Allow nulls as possible values and allow the `Null` data type by default.
Accepts either a boolean or a mapping of column names to booleans.
allow_chunks : bool
Allow the DataFrame to contain multiple chunks.
allowed_dtypes : {list,set}, optional
when automatically generating data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -384,6 +385,12 @@ def dataframes( # noqa: D417
version="0.20.26",
)
kwargs["allow_infinity"] = allow_inf
if (chunked := kwargs.pop("chunked", None)) is not None:
issue_deprecation_warning(
"`chunked` is deprecated. Use `allow_chunks` instead.",
version="0.20.26",
)
allow_chunks = chunked

if isinstance(include_cols, column):
include_cols = [include_cols]
Expand Down Expand Up @@ -414,17 +421,19 @@ def dataframes( # noqa: D417
else:
c.allow_null = allow_null

allow_series_chunks = draw(st.booleans()) if allow_chunks else False

with StringCache():
data = {
c.name: draw(
series(
name=c.name,
dtype=c.dtype,
size=size,
allow_null=c.allow_null, # type: ignore[arg-type]
strategy=c.strategy,
allow_null=c.allow_null, # type: ignore[arg-type]
allow_chunks=allow_series_chunks,
unique=c.unique,
chunked=None if chunked is None else False,
allowed_dtypes=allowed_dtypes,
excluded_dtypes=excluded_dtypes,
**kwargs,
Expand All @@ -435,8 +444,8 @@ def dataframes( # noqa: D417

df = DataFrame(data)

# Optionally generate chunked frames
if size > 1 and chunked:
# Apply chunking
if allow_chunks and size > 1 and not allow_series_chunks and draw(st.booleans()):
split_at = size // 2
df = df[:split_at].vstack(df[split_at:])

Expand Down
25 changes: 17 additions & 8 deletions py-polars/polars/testing/parametric/strategies/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
if TYPE_CHECKING:
from datetime import date, datetime, time

from hypothesis.strategies import DrawFn, SearchStrategy
from hypothesis.strategies import SearchStrategy

from polars.datatypes import DataType, DataTypeClass
from polars.type_aliases import PolarsDataType, SchemaDict, TimeUnit
Expand Down Expand Up @@ -281,10 +281,12 @@ def lists(
)


@st.composite
def structs( # noqa: D417
draw: DrawFn, /, fields: Sequence[Field] | SchemaDict, **kwargs: Any
) -> dict[str, Any]:
def structs(
fields: Sequence[Field] | SchemaDict,
*,
allow_null: bool = True,
**kwargs: Any,
) -> SearchStrategy[dict[str, Any]]:
"""
Create a strategy for generating structs with the given fields.
Expand All @@ -293,14 +295,21 @@ def structs( # noqa: D417
fields
The fields that make up the struct. Can be either a sequence of Field
objects or a mapping of column names to data types.
allow_null
Allow nulls as possible values. If set to True, the returned dictionaries
may omit certain fields and are in random order.
**kwargs
Additional arguments that are passed to nested data generation strategies.
"""
if isinstance(fields, Mapping):
fields = [Field(name, dtype) for name, dtype in fields.items()]

strats = {f.name: data(f.dtype, **kwargs) for f in fields}
return {col: draw(strat) for col, strat in strats.items()}
strats = {f.name: data(f.dtype, allow_null=allow_null, **kwargs) for f in fields}

if allow_null:
return st.fixed_dictionaries({}, optional=strats)
else:
return st.fixed_dictionaries(strats)


def nulls() -> SearchStrategy[None]:
Expand Down Expand Up @@ -394,7 +403,7 @@ def data(
)
elif dtype == Struct:
fields = getattr(dtype, "fields", None) or [Field("f0", Null())]
strategy = structs(fields, **kwargs)
strategy = structs(fields, allow_null=allow_null, **kwargs)
else:
msg = f"unsupported data type: {dtype}"
raise InvalidArgument(msg)
Expand Down
61 changes: 46 additions & 15 deletions py-polars/polars/testing/parametric/strategies/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
Date,
Time,
Null,
# TODO: Enable Object types by default when various issues are solved.
# Object,
]
# Supported data type classes with arguments
_COMPLEX_DTYPES: list[DataTypeClass] = [
Expand Down Expand Up @@ -104,7 +106,9 @@ def dtypes(
The complexity of nested data types. If set to 0, nested data types are
disabled.
"""
flat_dtypes, nested_dtypes = _parse_allowed_dtypes(allowed_dtypes)
flat_dtypes, nested_dtypes, excluded_dtypes = _parse_dtype_restrictions(
allowed_dtypes, excluded_dtypes
)

if nesting_level > 0 and nested_dtypes:
if not flat_dtypes:
Expand All @@ -126,22 +130,49 @@ def dtypes(
return _flat_dtypes(allowed_dtypes=flat_dtypes, excluded_dtypes=excluded_dtypes)


def _parse_allowed_dtypes(
def _parse_dtype_restrictions(
allowed_dtypes: Collection[PolarsDataType] | None = None,
) -> tuple[Sequence[PolarsDataType], Sequence[PolarsDataType]]:
"""Split allowed dtypes into flat and nested data types."""
if allowed_dtypes is None:
return _FLAT_DTYPES, _NESTED_DTYPES

allowed_dtypes_flat = []
allowed_dtypes_nested = []
for dt in allowed_dtypes:
if dt.is_nested():
allowed_dtypes_nested.append(dt)
else:
allowed_dtypes_flat.append(dt)
excluded_dtypes: Sequence[PolarsDataType] | None = None,
) -> tuple[list[PolarsDataType], list[PolarsDataType], list[DataType]]:
"""
Parse data type restrictions.
return allowed_dtypes_flat, allowed_dtypes_nested
Splits allowed data types into flat and nested data types.
Filters the allowed data types by excluded data type classes.
Excluded instantiated data types are returned to be filtered later.
"""
# Split excluded dtypes into instances and classes
excluded_dtypes_instance = []
excluded_dtypes_class = []
if excluded_dtypes:
for dt in excluded_dtypes:
if isinstance(dt, DataType):
excluded_dtypes_instance.append(dt)
else:
excluded_dtypes_class.append(dt)

# Split allowed dtypes into flat and nested, excluding certain dtype classes
allowed_dtypes_flat: list[PolarsDataType]
allowed_dtypes_nested: list[PolarsDataType]
if allowed_dtypes is None:
allowed_dtypes_flat = [
dt for dt in _FLAT_DTYPES if dt not in excluded_dtypes_class
]
allowed_dtypes_nested = [
dt for dt in _NESTED_DTYPES if dt not in excluded_dtypes_class
]
else:
allowed_dtypes_flat = []
allowed_dtypes_nested = []
for dt in allowed_dtypes:
if dt in excluded_dtypes_class:
continue
elif dt.is_nested():
allowed_dtypes_nested.append(dt)
else:
allowed_dtypes_flat.append(dt)

return allowed_dtypes_flat, allowed_dtypes_nested, excluded_dtypes_instance


@st.composite
Expand Down
22 changes: 8 additions & 14 deletions py-polars/tests/unit/interchange/test_roundtrip.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from polars.testing import assert_frame_equal
from polars.testing.parametric import dataframes

integer_dtypes: list[pl.PolarsDataType] = [
protocol_dtypes: list[pl.PolarsDataType] = [
pl.Int8,
pl.Int16,
pl.Int32,
Expand All @@ -22,8 +22,6 @@
pl.UInt16,
pl.UInt32,
pl.UInt64,
]
protocol_dtypes: list[pl.PolarsDataType] = integer_dtypes + [
pl.Float32,
pl.Float64,
pl.Boolean,
Expand Down Expand Up @@ -58,7 +56,7 @@ def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
pl.String, # Polars String type does not match protocol spec
pl.Categorical,
],
chunked=False,
allow_chunks=False,
)
)
def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -103,7 +101,7 @@ def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
pl.String, # Polars String type does not match protocol spec
pl.Categorical,
],
chunked=False,
allow_chunks=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
Expand Down Expand Up @@ -136,7 +134,7 @@ def test_from_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
pl.Categorical, # Polars copies the categories to construct a mapping
pl.Boolean, # pyarrow exports boolean buffers as byte-packed: https://github.com/apache/arrow/issues/37991
],
chunked=False,
allow_chunks=False,
)
)
def test_from_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -167,9 +165,7 @@ def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None:

@given(
dataframes(
allowed_dtypes=(
integer_dtypes + [pl.Datetime] # Smaller selection to improve performance
),
allowed_dtypes=protocol_dtypes,
excluded_dtypes=[
pl.String, # Polars String type does not match protocol spec
pl.Categorical, # Categoricals come back as Enums
Expand All @@ -180,7 +176,7 @@ def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
# Empty dataframes cause an error due to a bug in pandas.
# https://github.com/pandas-dev/pandas/issues/56700
min_size=1,
chunked=False,
allow_chunks=False,
)
)
@pytest.mark.skipif(
Expand Down Expand Up @@ -215,9 +211,7 @@ def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:

@given(
dataframes(
allowed_dtypes=(
integer_dtypes + [pl.Datetime] # Smaller selection to improve performance
),
allowed_dtypes=protocol_dtypes,
excluded_dtypes=[
pl.String, # Polars String type does not match protocol spec
pl.Categorical, # Categoricals come back as Enums
Expand All @@ -228,7 +222,7 @@ def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
# Empty dataframes cause an error due to a bug in pandas.
# https://github.com/pandas-dev/pandas/issues/56700
min_size=1,
chunked=False,
allow_chunks=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/series/buffers/test_from_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
@given(
s=series(
allowed_dtypes=(pl.INTEGER_DTYPES | pl.FLOAT_DTYPES | {pl.Boolean}),
chunked=False,
allow_chunks=False,
allow_null=False,
)
)
Expand Down
Loading

0 comments on commit 84ac01b

Please sign in to comment.