Skip to content

Commit

Permalink
feat: add subset parameter to DataFrame.dropna to select which columns to consider (#981)
Browse files Browse the repository at this point in the history

* feat: add `subset` parameter to `DataFrame.dropna` to select which columns to consider

* fix dropna with subset=None

* refactor: remove circular dependencies preventing local doctest runs

With this change I can once again run

```
pytest --doctest-modules third_party/bigframes_vendored/pandas/core/frame.py
```

Note: having multiple `version.py` files should be fine. release-please
will update all such files it finds.

* fix doctest

* Revert "Merge branch 'tswast-circular-import' into b366248570-dropna-subset"

This reverts commit 57e8335, reversing
changes made to 197074a.

* Reapply "Merge branch 'tswast-circular-import' into b366248570-dropna-subset"

This reverts commit 0f18294.

* loop over tuple result

---------

Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com>
  • Loading branch information
tswast and Genesis929 authored Sep 16, 2024
1 parent deac6d2 commit f7c03dc
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 12 deletions.
12 changes: 10 additions & 2 deletions bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import functools
import typing
from typing import Sequence
from typing import Optional, Sequence

import bigframes_vendored.constants as constants
import pandas as pd
Expand Down Expand Up @@ -488,11 +488,19 @@ def dropna(
block: blocks.Block,
column_ids: typing.Sequence[str],
how: typing.Literal["all", "any"] = "any",
subset: Optional[typing.Sequence[str]] = None,
):
"""
Drop na entries from block
"""
predicates = [ops.notnull_op.as_expr(column_id) for column_id in column_ids]
if subset is None:
subset = column_ids

predicates = [
ops.notnull_op.as_expr(column_id)
for column_id in column_ids
if column_id in subset
]
if len(predicates) == 0:
return block
if how == "any":
Expand Down
22 changes: 20 additions & 2 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2027,8 +2027,9 @@ def dropna(
self,
*,
axis: int | str = 0,
inplace: bool = False,
how: str = "any",
subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None,
inplace: bool = False,
ignore_index=False,
) -> DataFrame:
if inplace:
Expand All @@ -2040,8 +2041,25 @@ def dropna(

axis_n = utils.get_axis_number(axis)

if subset is not None and axis_n != 0:
raise NotImplementedError(
f"subset only supported when axis=0. {constants.FEEDBACK_LINK}"
)

if axis_n == 0:
result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore
# subset needs to be converted into column IDs, not column labels.
if subset is None:
subset_ids = None
elif not utils.is_list_like(subset):
subset_ids = [id_ for id_ in self._block.label_to_col_id[subset]]
else:
subset_ids = [
id_
for label in subset
for id_ in self._block.label_to_col_id[label]
]

result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore
if ignore_index:
result = result.reset_index()
return DataFrame(result)
Expand Down
21 changes: 13 additions & 8 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,19 +936,24 @@ def test_assign_callable_lambda(scalars_dfs):

@skip_legacy_pandas
@pytest.mark.parametrize(
("axis", "how", "ignore_index"),
("axis", "how", "ignore_index", "subset"),
[
(0, "any", False),
(0, "any", True),
(1, "any", False),
(1, "all", False),
(0, "any", False, None),
(0, "any", True, None),
(0, "all", False, ["bool_col", "time_col"]),
(0, "any", False, ["bool_col", "time_col"]),
(0, "all", False, "time_col"),
(1, "any", False, None),
(1, "all", False, None),
],
)
def test_df_dropna(scalars_dfs, axis, how, ignore_index):
def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset):
scalars_df, scalars_pandas_df = scalars_dfs
df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset)
bf_result = df.to_pandas()
pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
pd_result = scalars_pandas_df.dropna(
axis=axis, how=how, ignore_index=ignore_index, subset=subset
)

# Pandas uses int64 instead of Int64 (nullable) dtype.
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
Expand Down
9 changes: 9 additions & 0 deletions tests/unit/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@
from . import resources


def test_dataframe_dropna_axis_1_subset_not_implememented(
    monkeypatch: pytest.MonkeyPatch,
):
    """Check that ``dropna(axis=1, subset=...)`` raises NotImplementedError.

    The error message is expected to mention "subset".
    """
    df = resources.create_dataframe(monkeypatch)
    column_labels = ["col1", "col2"]

    with pytest.raises(NotImplementedError, match="subset"):
        df.dropna(axis=1, subset=column_labels)


def test_dataframe_repr_with_uninitialized_object():
"""Ensures DataFrame.__init__ can be paused in a visual debugger without crashing.
Expand Down
17 changes: 17 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1614,6 +1614,8 @@ def dropna(
*,
axis: int | str = 0,
how: str = "any",
subset=None,
inplace: bool = False,
ignore_index=False,
) -> DataFrame:
"""Remove missing values.
Expand Down Expand Up @@ -1662,6 +1664,15 @@ def dropna(
<BLANKLINE>
[3 rows x 3 columns]
Define in which columns to look for missing values.
>>> df.dropna(subset=['name', 'toy'])
name toy born
1 Batman Batmobile 1940-04-25
2 Catwoman Bullwhip <NA>
<BLANKLINE>
[2 rows x 3 columns]
Args:
axis ({0 or 'index', 1 or 'columns'}, default 0):
Determine if rows or columns which contain missing values are
Expand All @@ -1675,6 +1686,12 @@ def dropna(
* 'any' : If any NA values are present, drop that row or column.
* 'all' : If all values are NA, drop that row or column.
subset (column label or sequence of labels, optional):
Labels along other axis to consider, e.g. if you are dropping
rows these would be a list of columns to include.
Only supports axis=0.
inplace (bool, default ``False``):
Not supported.
ignore_index (bool, default ``False``):
If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
Expand Down

0 comments on commit f7c03dc

Please sign in to comment.