[Data] Fix Discretizers transforming ignored cols (#31404)

Signed-off-by: Antoni Baum <antoni.baum@protonmail.com> This PR fixes a bug in Discretizer Preprocessors where all of the columns in a Dataset would be transformed instead of just the specified ones. This led to exceptions due to KeyErrors later.
ray-project · Jan 3, 2023 · 0d1e576 · 0d1e576
1 parent 0c3d32d
commit 0d1e576
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 4 deletions.
diff --git a/python/ray/data/preprocessors/discretizer.py b/python/ray/data/preprocessors/discretizer.py
@@ -20,6 +20,8 @@ class _AbstractKBinsDiscretizer(Preprocessor):
 
     def _transform_pandas(self, df: pd.DataFrame):
         def bin_values(s: pd.Series) -> pd.Series:
+            if s.name not in self.columns:
+                return s
             labels = self.dtypes.get(s.name) if self.dtypes else False
             ordered = True
             if labels:

diff --git a/python/ray/data/tests/preprocessors/test_discretizer.py b/python/ray/data/tests/preprocessors/test_discretizer.py
@@ -25,8 +25,9 @@ def test_uniform_kbins_discretizer(
     """Tests basic UniformKBinsDiscretizer functionality."""
 
     col_a = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
-    col_b = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
-    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
+    col_b = col_a.copy()
+    col_c = col_a.copy()
+    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b, "C": col_c})
     ds = ray.data.from_pandas(in_df).repartition(2)
 
     discretizer = UniformKBinsDiscretizer(
@@ -74,6 +75,8 @@ def test_uniform_kbins_discretizer(
             include_lowest=include_lowest,
         )
     )
+    # Check that the remaining column was not modified
+    assert out_df["C"].equals(in_df["C"])
 
 
 @pytest.mark.parametrize(
@@ -98,8 +101,9 @@ def test_custom_kbins_discretizer(
     """Tests basic CustomKBinsDiscretizer functionality."""
 
     col_a = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
-    col_b = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
-    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
+    col_b = col_a.copy()
+    col_c = col_a.copy()
+    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b, "C": col_c})
     ds = ray.data.from_pandas(in_df).repartition(2)
 
     discretizer = CustomKBinsDiscretizer(
@@ -147,6 +151,8 @@ def test_custom_kbins_discretizer(
             include_lowest=include_lowest,
         )
     )
+    # Check that the remaining column was not modified
+    assert out_df["C"].equals(in_df["C"])
 
 
 if __name__ == "__main__":