From 84ac01b3648a9692c0173c9d39977553e48e5d44 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 16 May 2024 12:53:50 +0200 Subject: [PATCH] depr(python): Change parameter `chunked` to `allow_chunks` in parametric testing strategies (#16264) --- .../testing/parametric/strategies/core.py | 49 +++++++++------ .../testing/parametric/strategies/data.py | 25 +++++--- .../testing/parametric/strategies/dtype.py | 61 ++++++++++++++----- .../tests/unit/interchange/test_roundtrip.py | 22 +++---- .../unit/series/buffers/test_from_buffer.py | 2 +- .../unit/series/buffers/test_from_buffers.py | 6 +- .../parametric/strategies/test_core.py | 36 +++++------ 7 files changed, 122 insertions(+), 79 deletions(-) diff --git a/py-polars/polars/testing/parametric/strategies/core.py b/py-polars/polars/testing/parametric/strategies/core.py index e1b8f6cc1bd9..cb8960231ebd 100644 --- a/py-polars/polars/testing/parametric/strategies/core.py +++ b/py-polars/polars/testing/parametric/strategies/core.py @@ -40,8 +40,8 @@ def series( # noqa: D417 max_size: int = _ROW_LIMIT, strategy: SearchStrategy[Any] | None = None, allow_null: bool = True, + allow_chunks: bool = True, unique: bool = False, - chunked: bool | None = None, allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None, excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None, **kwargs: Any, @@ -69,11 +69,10 @@ def series( # noqa: D417 supports overriding the default strategy for the given dtype. allow_null : bool Allow nulls as possible values and allow the `Null` data type by default. + allow_chunks : bool + Allow the Series to contain multiple chunks. unique : bool, optional indicate whether Series values should all be distinct. - chunked : bool, optional - ensure that Series with more than one element have `n_chunks` > 1. - if omitted, chunking is applied at random. allowed_dtypes : {list,set}, optional when automatically generating Series data, allow only these dtypes. excluded_dtypes : {list,set}, optional @@ -137,6 +136,12 @@ def series( # noqa: D417 version="0.20.26", ) kwargs["allow_infinity"] = allow_inf + if (chunked := kwargs.pop("chunked", None)) is not None: + issue_deprecation_warning( + "`chunked` is deprecated. Use `allow_chunks` instead.", + version="0.20.26", + ) + allow_chunks = chunked if isinstance(allowed_dtypes, (DataType, DataTypeClass)): allowed_dtypes = [allowed_dtypes] @@ -196,12 +201,9 @@ def series( # noqa: D417 s = Series(name=name, values=values, dtype=dtype) # Apply chunking - if size > 1: - if chunked is None: - chunked = draw(st.booleans()) - if chunked: - split_at = size // 2 - s = s[:split_at].append(s[split_at:]) + if allow_chunks and size > 1 and draw(st.booleans()): + split_at = size // 2 + s = s[:split_at].append(s[split_at:]) return s @@ -216,9 +218,9 @@ def dataframes( size: int | None = None, min_size: int = 0, max_size: int = _ROW_LIMIT, - chunked: bool | None = None, include_cols: Sequence[column] | column | None = None, allow_null: bool | Mapping[str, bool] = True, + allow_chunks: bool = True, allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None, excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None, **kwargs: Any, @@ -235,9 +237,9 @@ def dataframes( size: int | None = None, min_size: int = 0, max_size: int = _ROW_LIMIT, - chunked: bool | None = None, include_cols: Sequence[column] | column | None = None, allow_null: bool | Mapping[str, bool] = True, + allow_chunks: bool = True, allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None, excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None, **kwargs: Any, @@ -256,9 +258,9 @@ def dataframes( # noqa: D417 size: int | None = None, min_size: int = 0, max_size: int = _ROW_LIMIT, - chunked: bool | None = None, include_cols: Sequence[column] | column | None = None, allow_null: bool | Mapping[str, bool] = True, + allow_chunks: bool = True, allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None, excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None, **kwargs: Any, @@ -287,9 +289,6 @@ def dataframes( # noqa: D417 max_size : int, optional if not passing an exact size, set the maximum number of rows in the DataFrame. - chunked : bool, optional - ensure that DataFrames with more than row have `n_chunks` > 1. if - omitted, chunking will be randomised at the level of individual Series. include_cols : [column], optional a list of `column` objects to include in the generated DataFrame. note that explicitly provided columns are appended onto the list of existing columns @@ -297,6 +296,8 @@ def dataframes( # noqa: D417 allow_null : bool or Mapping[str, bool] Allow nulls as possible values and allow the `Null` data type by default. Accepts either a boolean or a mapping of column names to booleans. + allow_chunks : bool + Allow the DataFrame to contain multiple chunks. allowed_dtypes : {list,set}, optional when automatically generating data, allow only these dtypes. excluded_dtypes : {list,set}, optional @@ -384,6 +385,12 @@ def dataframes( # noqa: D417 version="0.20.26", ) kwargs["allow_infinity"] = allow_inf + if (chunked := kwargs.pop("chunked", None)) is not None: + issue_deprecation_warning( + "`chunked` is deprecated. Use `allow_chunks` instead.", + version="0.20.26", + ) + allow_chunks = chunked if isinstance(include_cols, column): include_cols = [include_cols] @@ -414,6 +421,8 @@ def dataframes( # noqa: D417 else: c.allow_null = allow_null + allow_series_chunks = draw(st.booleans()) if allow_chunks else False + with StringCache(): data = { c.name: draw( @@ -421,10 +430,10 @@ def dataframes( # noqa: D417 name=c.name, dtype=c.dtype, size=size, - allow_null=c.allow_null, # type: ignore[arg-type] strategy=c.strategy, + allow_null=c.allow_null, # type: ignore[arg-type] + allow_chunks=allow_series_chunks, unique=c.unique, - chunked=None if chunked is None else False, allowed_dtypes=allowed_dtypes, excluded_dtypes=excluded_dtypes, **kwargs, @@ -435,8 +444,8 @@ def dataframes( # noqa: D417 df = DataFrame(data) - # Optionally generate chunked frames - if size > 1 and chunked: + # Apply chunking + if allow_chunks and size > 1 and not allow_series_chunks and draw(st.booleans()): split_at = size // 2 df = df[:split_at].vstack(df[split_at:]) diff --git a/py-polars/polars/testing/parametric/strategies/data.py b/py-polars/polars/testing/parametric/strategies/data.py index 3c7f9c116db6..23faa1e5a3cf 100644 --- a/py-polars/polars/testing/parametric/strategies/data.py +++ b/py-polars/polars/testing/parametric/strategies/data.py @@ -61,7 +61,7 @@ if TYPE_CHECKING: from datetime import date, datetime, time - from hypothesis.strategies import DrawFn, SearchStrategy + from hypothesis.strategies import SearchStrategy from polars.datatypes import DataType, DataTypeClass from polars.type_aliases import PolarsDataType, SchemaDict, TimeUnit @@ -281,10 +281,12 @@ def lists( ) -@st.composite -def structs( # noqa: D417 - draw: DrawFn, /, fields: Sequence[Field] | SchemaDict, **kwargs: Any -) -> dict[str, Any]: +def structs( + fields: Sequence[Field] | SchemaDict, + *, + allow_null: bool = True, + **kwargs: Any, +) -> SearchStrategy[dict[str, Any]]: """ Create a strategy for generating structs with the given fields. @@ -293,14 +295,21 @@ def structs( # noqa: D417 fields The fields that make up the struct. Can be either a sequence of Field objects or a mapping of column names to data types. + allow_null + Allow nulls as possible values. If set to True, the returned dictionaries + may miss certain fields and are in random order. **kwargs Additional arguments that are passed to nested data generation strategies. """ if isinstance(fields, Mapping): fields = [Field(name, dtype) for name, dtype in fields.items()] - strats = {f.name: data(f.dtype, **kwargs) for f in fields} - return {col: draw(strat) for col, strat in strats.items()} + strats = {f.name: data(f.dtype, allow_null=allow_null, **kwargs) for f in fields} + + if allow_null: + return st.fixed_dictionaries({}, optional=strats) + else: + return st.fixed_dictionaries(strats) def nulls() -> SearchStrategy[None]: @@ -394,7 +403,7 @@ def data( ) elif dtype == Struct: fields = getattr(dtype, "fields", None) or [Field("f0", Null())] - strategy = structs(fields, **kwargs) + strategy = structs(fields, allow_null=allow_null, **kwargs) else: msg = f"unsupported data type: {dtype}" raise InvalidArgument(msg) diff --git a/py-polars/polars/testing/parametric/strategies/dtype.py b/py-polars/polars/testing/parametric/strategies/dtype.py index 835bb4d8103d..cde2656788d4 100644 --- a/py-polars/polars/testing/parametric/strategies/dtype.py +++ b/py-polars/polars/testing/parametric/strategies/dtype.py @@ -59,6 +59,8 @@ Date, Time, Null, + # TODO: Enable Object types by default when various issues are solved. + # Object, ] # Supported data type classes with arguments _COMPLEX_DTYPES: list[DataTypeClass] = [ @@ -104,7 +106,9 @@ def dtypes( The complexity of nested data types. If set to 0, nested data types are disabled. """ - flat_dtypes, nested_dtypes = _parse_allowed_dtypes(allowed_dtypes) + flat_dtypes, nested_dtypes, excluded_dtypes = _parse_dtype_restrictions( + allowed_dtypes, excluded_dtypes + ) if nesting_level > 0 and nested_dtypes: if not flat_dtypes: @@ -126,22 +130,49 @@ def dtypes( return _flat_dtypes(allowed_dtypes=flat_dtypes, excluded_dtypes=excluded_dtypes) -def _parse_allowed_dtypes( +def _parse_dtype_restrictions( allowed_dtypes: Collection[PolarsDataType] | None = None, -) -> tuple[Sequence[PolarsDataType], Sequence[PolarsDataType]]: - """Split allowed dtypes into flat and nested data types.""" - if allowed_dtypes is None: - return _FLAT_DTYPES, _NESTED_DTYPES - - allowed_dtypes_flat = [] - allowed_dtypes_nested = [] - for dt in allowed_dtypes: - if dt.is_nested(): - allowed_dtypes_nested.append(dt) - else: - allowed_dtypes_flat.append(dt) + excluded_dtypes: Sequence[PolarsDataType] | None = None, +) -> tuple[list[PolarsDataType], list[PolarsDataType], list[DataType]]: + """ + Parse data type restrictions. - return allowed_dtypes_flat, allowed_dtypes_nested + Splits allowed data types into flat and nested data types. + Filters the allowed data types by excluded data type classes. + Excluded instantiated data types are returned to be filtered later. + """ + # Split excluded dtypes into instances and classes + excluded_dtypes_instance = [] + excluded_dtypes_class = [] + if excluded_dtypes: + for dt in excluded_dtypes: + if isinstance(dt, DataType): + excluded_dtypes_instance.append(dt) + else: + excluded_dtypes_class.append(dt) + + # Split allowed dtypes into flat and nested, excluding certain dtype classes + allowed_dtypes_flat: list[PolarsDataType] + allowed_dtypes_nested: list[PolarsDataType] + if allowed_dtypes is None: + allowed_dtypes_flat = [ + dt for dt in _FLAT_DTYPES if dt not in excluded_dtypes_class + ] + allowed_dtypes_nested = [ + dt for dt in _NESTED_DTYPES if dt not in excluded_dtypes_class + ] + else: + allowed_dtypes_flat = [] + allowed_dtypes_nested = [] + for dt in allowed_dtypes: + if dt in excluded_dtypes_class: + continue + elif dt.is_nested(): + allowed_dtypes_nested.append(dt) + else: + allowed_dtypes_flat.append(dt) + + return allowed_dtypes_flat, allowed_dtypes_nested, excluded_dtypes_instance @st.composite diff --git a/py-polars/tests/unit/interchange/test_roundtrip.py b/py-polars/tests/unit/interchange/test_roundtrip.py index e0bb7f0d65cc..a600009f642a 100644 --- a/py-polars/tests/unit/interchange/test_roundtrip.py +++ b/py-polars/tests/unit/interchange/test_roundtrip.py @@ -13,7 +13,7 @@ from polars.testing import assert_frame_equal from polars.testing.parametric import dataframes -integer_dtypes: list[pl.PolarsDataType] = [ +protocol_dtypes: list[pl.PolarsDataType] = [ pl.Int8, pl.Int16, pl.Int32, @@ -22,8 +22,6 @@ pl.UInt16, pl.UInt32, pl.UInt64, -] -protocol_dtypes: list[pl.PolarsDataType] = integer_dtypes + [ pl.Float32, pl.Float64, pl.Boolean, @@ -58,7 +56,7 @@ def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None: pl.String, # Polars String type does not match protocol spec pl.Categorical, ], - chunked=False, + allow_chunks=False, ) ) def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None: @@ -103,7 +101,7 @@ def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None: pl.String, # Polars String type does not match protocol spec pl.Categorical, ], - chunked=False, + allow_chunks=False, allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190 ) ) @@ -136,7 +134,7 @@ def test_from_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None: pl.Categorical, # Polars copies the categories to construct a mapping pl.Boolean, # pyarrow exports boolean buffers as byte-packed: https://github.com/apache/arrow/issues/37991 ], - chunked=False, + allow_chunks=False, ) ) def test_from_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None: @@ -167,9 +165,7 @@ def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None: @given( dataframes( - allowed_dtypes=( - integer_dtypes + [pl.Datetime] # Smaller selection to improve performance - ), + allowed_dtypes=protocol_dtypes, excluded_dtypes=[ pl.String, # Polars String type does not match protocol spec pl.Categorical, # Categoricals come back as Enums @@ -180,7 +176,7 @@ def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None: # Empty dataframes cause an error due to a bug in pandas. # https://github.com/pandas-dev/pandas/issues/56700 min_size=1, - chunked=False, + allow_chunks=False, ) ) @pytest.mark.skipif( @@ -215,9 +211,7 @@ def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None: @given( dataframes( - allowed_dtypes=( - integer_dtypes + [pl.Datetime] # Smaller selection to improve performance - ), + allowed_dtypes=protocol_dtypes, excluded_dtypes=[ pl.String, # Polars String type does not match protocol spec pl.Categorical, # Categoricals come back as Enums @@ -228,7 +222,7 @@ def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None: # Empty dataframes cause an error due to a bug in pandas. # https://github.com/pandas-dev/pandas/issues/56700 min_size=1, - chunked=False, + allow_chunks=False, allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190 ) ) diff --git a/py-polars/tests/unit/series/buffers/test_from_buffer.py b/py-polars/tests/unit/series/buffers/test_from_buffer.py index 5eeecb3adf35..99588293dd42 100644 --- a/py-polars/tests/unit/series/buffers/test_from_buffer.py +++ b/py-polars/tests/unit/series/buffers/test_from_buffer.py @@ -13,7 +13,7 @@ @given( s=series( allowed_dtypes=(pl.INTEGER_DTYPES | pl.FLOAT_DTYPES | {pl.Boolean}), - chunked=False, + allow_chunks=False, allow_null=False, ) ) diff --git a/py-polars/tests/unit/series/buffers/test_from_buffers.py b/py-polars/tests/unit/series/buffers/test_from_buffers.py index 497591e94a5c..1c2c64c9a84e 100644 --- a/py-polars/tests/unit/series/buffers/test_from_buffers.py +++ b/py-polars/tests/unit/series/buffers/test_from_buffers.py @@ -24,7 +24,7 @@ @given( s=series( allowed_dtypes=(pl.INTEGER_DTYPES | pl.FLOAT_DTYPES | {pl.Boolean}), - chunked=False, + allow_chunks=False, ) ) def test_series_from_buffers_numeric_with_validity(s: pl.Series) -> None: @@ -36,7 +36,7 @@ def test_series_from_buffers_numeric_with_validity(s: pl.Series) -> None: @given( s=series( allowed_dtypes=(pl.INTEGER_DTYPES | pl.FLOAT_DTYPES | {pl.Boolean}), - chunked=False, + allow_chunks=False, allow_null=False, ) ) @@ -45,7 +45,7 @@ def test_series_from_buffers_numeric(s: pl.Series) -> None: assert_series_equal(s, result) -@given(s=series(allowed_dtypes=TEMPORAL_DTYPES, chunked=False)) +@given(s=series(allowed_dtypes=TEMPORAL_DTYPES, allow_chunks=False)) def test_series_from_buffers_temporal_with_validity(s: pl.Series) -> None: validity = s.is_not_null() physical = pl.Int32 if s.dtype == pl.Date else pl.Int64 diff --git a/py-polars/tests/unit/testing/parametric/strategies/test_core.py b/py-polars/tests/unit/testing/parametric/strategies/test_core.py index 08aac7fb75a1..68409a0d7462 100644 --- a/py-polars/tests/unit/testing/parametric/strategies/test_core.py +++ b/py-polars/tests/unit/testing/parametric/strategies/test_core.py @@ -244,26 +244,17 @@ def test_strategy_dtypes( assert not s2.dtype.is_temporal() -@given( - df1=dataframes(chunked=False, min_size=1), - df2=dataframes(chunked=True, min_size=1), - s1=series(chunked=False, min_size=1), - s2=series(chunked=True, min_size=1), -) +@given(s=series(allow_chunks=False)) @settings(max_examples=10) -def test_chunking( - df1: pl.DataFrame, - df2: pl.DataFrame, - s1: pl.Series, - s2: pl.Series, -) -> None: - assert df1.n_chunks() == 1 - if len(df2) > 1: - assert df2.n_chunks("all") == [2] * len(df2.columns) +def test_series_allow_chunks(s: pl.Series) -> None: + assert s.n_chunks() == 1 + - assert s1.n_chunks() == 1 - if len(s2) > 1: - assert s2.n_chunks() > 1 +@given(df=dataframes(allow_chunks=False)) +@settings(max_examples=10) +def test_dataframes_allow_chunks(df: pl.DataFrame) -> None: + assert df.n_chunks("first") == 1 + assert df.n_chunks("all") == [1] * df.width @given( @@ -302,3 +293,12 @@ def test_dataframes_allowed_dtypes_integer_cols(df: pl.DataFrame) -> None: assert all( tp in (pl.Int8, pl.UInt16, pl.List(pl.Int32)) for tp in df.schema.values() ) + + +@given(st.data()) +@settings(max_examples=1) +def test_series_chunked_deprecated(data: st.DataObject) -> None: + with pytest.deprecated_call(): + data.draw(series(chunked=True)) + with pytest.deprecated_call(): + data.draw(dataframes(chunked=True))