Skip to content

Commit

Permalink
[Data] Fix Discretizers transforming ignored cols (#31404)
Browse files Browse the repository at this point in the history
Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>

This PR fixes a bug in Discretizer Preprocessors where all of the columns in a Dataset would be transformed instead of just the specified ones. This led to exceptions due to KeyErrors later.
  • Loading branch information
Yard1 authored Jan 3, 2023
1 parent 0c3d32d commit 0d1e576
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 4 deletions.
2 changes: 2 additions & 0 deletions python/ray/data/preprocessors/discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ class _AbstractKBinsDiscretizer(Preprocessor):

def _transform_pandas(self, df: pd.DataFrame):
def bin_values(s: pd.Series) -> pd.Series:
if s.name not in self.columns:
return s
labels = self.dtypes.get(s.name) if self.dtypes else False
ordered = True
if labels:
Expand Down
14 changes: 10 additions & 4 deletions python/ray/data/tests/preprocessors/test_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ def test_uniform_kbins_discretizer(
"""Tests basic UniformKBinsDiscretizer functionality."""

col_a = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
col_b = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
col_b = col_a.copy()
col_c = col_a.copy()
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b, "C": col_c})
ds = ray.data.from_pandas(in_df).repartition(2)

discretizer = UniformKBinsDiscretizer(
Expand Down Expand Up @@ -74,6 +75,8 @@ def test_uniform_kbins_discretizer(
include_lowest=include_lowest,
)
)
# Check that the remaining column was not modified
assert out_df["C"].equals(in_df["C"])


@pytest.mark.parametrize(
Expand All @@ -98,8 +101,9 @@ def test_custom_kbins_discretizer(
"""Tests basic CustomKBinsDiscretizer functionality."""

col_a = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
col_b = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
col_b = col_a.copy()
col_c = col_a.copy()
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b, "C": col_c})
ds = ray.data.from_pandas(in_df).repartition(2)

discretizer = CustomKBinsDiscretizer(
Expand Down Expand Up @@ -147,6 +151,8 @@ def test_custom_kbins_discretizer(
include_lowest=include_lowest,
)
)
# Check that the remaining column was not modified
assert out_df["C"].equals(in_df["C"])


if __name__ == "__main__":
Expand Down

0 comments on commit 0d1e576

Please sign in to comment.