Skip to content

Commit

Permalink
depr(python): Change parameter chunked to allow_chunks in paramet…
Browse files Browse the repository at this point in the history
…ric testing strategies (#16264)
  • Loading branch information
stinodego authored May 16, 2024
1 parent 98a2d9b commit 84ac01b
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 79 deletions.
49 changes: 29 additions & 20 deletions py-polars/polars/testing/parametric/strategies/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ def series( # noqa: D417
max_size: int = _ROW_LIMIT,
strategy: SearchStrategy[Any] | None = None,
allow_null: bool = True,
allow_chunks: bool = True,
unique: bool = False,
chunked: bool | None = None,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand Down Expand Up @@ -69,11 +69,10 @@ def series( # noqa: D417
supports overriding the default strategy for the given dtype.
allow_null : bool
Allow nulls as possible values and allow the `Null` data type by default.
allow_chunks : bool
Allow the Series to contain multiple chunks.
unique : bool, optional
indicate whether Series values should all be distinct.
chunked : bool, optional
ensure that Series with more than one element have `n_chunks` > 1.
if omitted, chunking is applied at random.
allowed_dtypes : {list,set}, optional
when automatically generating Series data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -137,6 +136,12 @@ def series( # noqa: D417
version="0.20.26",
)
kwargs["allow_infinity"] = allow_inf
if (chunked := kwargs.pop("chunked", None)) is not None:
issue_deprecation_warning(
"`chunked` is deprecated. Use `allow_chunks` instead.",
version="0.20.26",
)
allow_chunks = chunked

if isinstance(allowed_dtypes, (DataType, DataTypeClass)):
allowed_dtypes = [allowed_dtypes]
Expand Down Expand Up @@ -196,12 +201,9 @@ def series( # noqa: D417
s = Series(name=name, values=values, dtype=dtype)

# Apply chunking
if size > 1:
if chunked is None:
chunked = draw(st.booleans())
if chunked:
split_at = size // 2
s = s[:split_at].append(s[split_at:])
if allow_chunks and size > 1 and draw(st.booleans()):
split_at = size // 2
s = s[:split_at].append(s[split_at:])

return s

Expand All @@ -216,9 +218,9 @@ def dataframes(
size: int | None = None,
min_size: int = 0,
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = True,
allow_chunks: bool = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -235,9 +237,9 @@ def dataframes(
size: int | None = None,
min_size: int = 0,
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = True,
allow_chunks: bool = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -256,9 +258,9 @@ def dataframes( # noqa: D417
size: int | None = None,
min_size: int = 0,
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = True,
allow_chunks: bool = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand Down Expand Up @@ -287,16 +289,15 @@ def dataframes( # noqa: D417
max_size : int, optional
if not passing an exact size, set the maximum number of rows in the
DataFrame.
chunked : bool, optional
ensure that DataFrames with more than one row have `n_chunks` > 1. if
omitted, chunking will be randomised at the level of individual Series.
include_cols : [column], optional
a list of `column` objects to include in the generated DataFrame. note that
explicitly provided columns are appended onto the list of existing columns
(if any present).
allow_null : bool or Mapping[str, bool]
Allow nulls as possible values and allow the `Null` data type by default.
Accepts either a boolean or a mapping of column names to booleans.
allow_chunks : bool
Allow the DataFrame to contain multiple chunks.
allowed_dtypes : {list,set}, optional
when automatically generating data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -384,6 +385,12 @@ def dataframes( # noqa: D417
version="0.20.26",
)
kwargs["allow_infinity"] = allow_inf
if (chunked := kwargs.pop("chunked", None)) is not None:
issue_deprecation_warning(
"`chunked` is deprecated. Use `allow_chunks` instead.",
version="0.20.26",
)
allow_chunks = chunked

if isinstance(include_cols, column):
include_cols = [include_cols]
Expand Down Expand Up @@ -414,17 +421,19 @@ def dataframes( # noqa: D417
else:
c.allow_null = allow_null

allow_series_chunks = draw(st.booleans()) if allow_chunks else False

with StringCache():
data = {
c.name: draw(
series(
name=c.name,
dtype=c.dtype,
size=size,
allow_null=c.allow_null, # type: ignore[arg-type]
strategy=c.strategy,
allow_null=c.allow_null, # type: ignore[arg-type]
allow_chunks=allow_series_chunks,
unique=c.unique,
chunked=None if chunked is None else False,
allowed_dtypes=allowed_dtypes,
excluded_dtypes=excluded_dtypes,
**kwargs,
Expand All @@ -435,8 +444,8 @@ def dataframes( # noqa: D417

df = DataFrame(data)

# Optionally generate chunked frames
if size > 1 and chunked:
# Apply chunking
if allow_chunks and size > 1 and not allow_series_chunks and draw(st.booleans()):
split_at = size // 2
df = df[:split_at].vstack(df[split_at:])

Expand Down
25 changes: 17 additions & 8 deletions py-polars/polars/testing/parametric/strategies/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
if TYPE_CHECKING:
from datetime import date, datetime, time

from hypothesis.strategies import DrawFn, SearchStrategy
from hypothesis.strategies import SearchStrategy

from polars.datatypes import DataType, DataTypeClass
from polars.type_aliases import PolarsDataType, SchemaDict, TimeUnit
Expand Down Expand Up @@ -281,10 +281,12 @@ def lists(
)


@st.composite
def structs( # noqa: D417
draw: DrawFn, /, fields: Sequence[Field] | SchemaDict, **kwargs: Any
) -> dict[str, Any]:
def structs(
fields: Sequence[Field] | SchemaDict,
*,
allow_null: bool = True,
**kwargs: Any,
) -> SearchStrategy[dict[str, Any]]:
"""
Create a strategy for generating structs with the given fields.
Expand All @@ -293,14 +295,21 @@ def structs( # noqa: D417
fields
The fields that make up the struct. Can be either a sequence of Field
objects or a mapping of column names to data types.
allow_null
Allow nulls as possible values. If set to True, the returned dictionaries
may omit certain fields and are in random order.
**kwargs
Additional arguments that are passed to nested data generation strategies.
"""
if isinstance(fields, Mapping):
fields = [Field(name, dtype) for name, dtype in fields.items()]

strats = {f.name: data(f.dtype, **kwargs) for f in fields}
return {col: draw(strat) for col, strat in strats.items()}
strats = {f.name: data(f.dtype, allow_null=allow_null, **kwargs) for f in fields}

if allow_null:
return st.fixed_dictionaries({}, optional=strats)
else:
return st.fixed_dictionaries(strats)


def nulls() -> SearchStrategy[None]:
Expand Down Expand Up @@ -394,7 +403,7 @@ def data(
)
elif dtype == Struct:
fields = getattr(dtype, "fields", None) or [Field("f0", Null())]
strategy = structs(fields, **kwargs)
strategy = structs(fields, allow_null=allow_null, **kwargs)
else:
msg = f"unsupported data type: {dtype}"
raise InvalidArgument(msg)
Expand Down
61 changes: 46 additions & 15 deletions py-polars/polars/testing/parametric/strategies/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
Date,
Time,
Null,
# TODO: Enable Object types by default when various issues are solved.
# Object,
]
# Supported data type classes with arguments
_COMPLEX_DTYPES: list[DataTypeClass] = [
Expand Down Expand Up @@ -104,7 +106,9 @@ def dtypes(
The complexity of nested data types. If set to 0, nested data types are
disabled.
"""
flat_dtypes, nested_dtypes = _parse_allowed_dtypes(allowed_dtypes)
flat_dtypes, nested_dtypes, excluded_dtypes = _parse_dtype_restrictions(
allowed_dtypes, excluded_dtypes
)

if nesting_level > 0 and nested_dtypes:
if not flat_dtypes:
Expand All @@ -126,22 +130,49 @@ def dtypes(
return _flat_dtypes(allowed_dtypes=flat_dtypes, excluded_dtypes=excluded_dtypes)


def _parse_allowed_dtypes(
def _parse_dtype_restrictions(
allowed_dtypes: Collection[PolarsDataType] | None = None,
) -> tuple[Sequence[PolarsDataType], Sequence[PolarsDataType]]:
"""Split allowed dtypes into flat and nested data types."""
if allowed_dtypes is None:
return _FLAT_DTYPES, _NESTED_DTYPES

allowed_dtypes_flat = []
allowed_dtypes_nested = []
for dt in allowed_dtypes:
if dt.is_nested():
allowed_dtypes_nested.append(dt)
else:
allowed_dtypes_flat.append(dt)
excluded_dtypes: Sequence[PolarsDataType] | None = None,
) -> tuple[list[PolarsDataType], list[PolarsDataType], list[DataType]]:
"""
Parse data type restrictions.
return allowed_dtypes_flat, allowed_dtypes_nested
Splits allowed data types into flat and nested data types.
Filters the allowed data types by excluded data type classes.
Excluded instantiated data types are returned to be filtered later.
"""
# Split excluded dtypes into instances and classes
excluded_dtypes_instance = []
excluded_dtypes_class = []
if excluded_dtypes:
for dt in excluded_dtypes:
if isinstance(dt, DataType):
excluded_dtypes_instance.append(dt)
else:
excluded_dtypes_class.append(dt)

# Split allowed dtypes into flat and nested, excluding certain dtype classes
allowed_dtypes_flat: list[PolarsDataType]
allowed_dtypes_nested: list[PolarsDataType]
if allowed_dtypes is None:
allowed_dtypes_flat = [
dt for dt in _FLAT_DTYPES if dt not in excluded_dtypes_class
]
allowed_dtypes_nested = [
dt for dt in _NESTED_DTYPES if dt not in excluded_dtypes_class
]
else:
allowed_dtypes_flat = []
allowed_dtypes_nested = []
for dt in allowed_dtypes:
if dt in excluded_dtypes_class:
continue
elif dt.is_nested():
allowed_dtypes_nested.append(dt)
else:
allowed_dtypes_flat.append(dt)

return allowed_dtypes_flat, allowed_dtypes_nested, excluded_dtypes_instance


@st.composite
Expand Down
22 changes: 8 additions & 14 deletions py-polars/tests/unit/interchange/test_roundtrip.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from polars.testing import assert_frame_equal
from polars.testing.parametric import dataframes

integer_dtypes: list[pl.PolarsDataType] = [
protocol_dtypes: list[pl.PolarsDataType] = [
pl.Int8,
pl.Int16,
pl.Int32,
Expand All @@ -22,8 +22,6 @@
pl.UInt16,
pl.UInt32,
pl.UInt64,
]
protocol_dtypes: list[pl.PolarsDataType] = integer_dtypes + [
pl.Float32,
pl.Float64,
pl.Boolean,
Expand Down Expand Up @@ -58,7 +56,7 @@ def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
pl.String, # Polars String type does not match protocol spec
pl.Categorical,
],
chunked=False,
allow_chunks=False,
)
)
def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -103,7 +101,7 @@ def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
pl.String, # Polars String type does not match protocol spec
pl.Categorical,
],
chunked=False,
allow_chunks=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
Expand Down Expand Up @@ -136,7 +134,7 @@ def test_from_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
pl.Categorical, # Polars copies the categories to construct a mapping
pl.Boolean, # pyarrow exports boolean buffers as byte-packed: https://github.com/apache/arrow/issues/37991
],
chunked=False,
allow_chunks=False,
)
)
def test_from_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -167,9 +165,7 @@ def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None:

@given(
dataframes(
allowed_dtypes=(
integer_dtypes + [pl.Datetime] # Smaller selection to improve performance
),
allowed_dtypes=protocol_dtypes,
excluded_dtypes=[
pl.String, # Polars String type does not match protocol spec
pl.Categorical, # Categoricals come back as Enums
Expand All @@ -180,7 +176,7 @@ def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
# Empty dataframes cause an error due to a bug in pandas.
# https://github.com/pandas-dev/pandas/issues/56700
min_size=1,
chunked=False,
allow_chunks=False,
)
)
@pytest.mark.skipif(
Expand Down Expand Up @@ -215,9 +211,7 @@ def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:

@given(
dataframes(
allowed_dtypes=(
integer_dtypes + [pl.Datetime] # Smaller selection to improve performance
),
allowed_dtypes=protocol_dtypes,
excluded_dtypes=[
pl.String, # Polars String type does not match protocol spec
pl.Categorical, # Categoricals come back as Enums
Expand All @@ -228,7 +222,7 @@ def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
# Empty dataframes cause an error due to a bug in pandas.
# https://github.com/pandas-dev/pandas/issues/56700
min_size=1,
chunked=False,
allow_chunks=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/series/buffers/test_from_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
@given(
s=series(
allowed_dtypes=(pl.INTEGER_DTYPES | pl.FLOAT_DTYPES | {pl.Boolean}),
chunked=False,
allow_chunks=False,
allow_null=False,
)
)
Expand Down
Loading

0 comments on commit 84ac01b

Please sign in to comment.