From 89515e0c3e491e5c652d2290cc63681aeb2762c0 Mon Sep 17 00:00:00 2001
From: Martin Bomio
Date: Sat, 15 Feb 2025 00:54:53 +0000
Subject: [PATCH] feat: allow hasher to run in append mode

Signed-off-by: Martin Bomio
---
 python/ray/data/preprocessors/hasher.py        | 26 ++++++++++++++-----
 .../data/tests/preprocessors/test_hasher.py    | 19 +++++++++-----
 .../tests/preprocessors/test_preprocessors.py  |  2 +-
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py
index 7045f2cd60d3b..ae9ae769ad9b5 100644
--- a/python/ray/data/preprocessors/hasher.py
+++ b/python/ray/data/preprocessors/hasher.py
@@ -48,8 +48,8 @@ class FeatureHasher(Preprocessor):
         :class:`FeatureHasher` hashes each token to determine its index. For example,
         the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.
 
-        >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
-        >>> hasher.fit_transform(ds).to_pandas().to_numpy()  # doctest: +SKIP
+        >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column="hashed")
+        >>> hasher.fit_transform(ds).to_pandas()["hashed"].to_numpy()  # doctest: +SKIP
         array([[0, 0, 0, 2, 0, 1, 0, 0],
                [0, 0, 0, 1, 0, 1, 1, 0]])
 
@@ -63,6 +63,7 @@ class FeatureHasher(Preprocessor):
         num_features: The number of features used to represent the vocabulary. You
             should choose a value large enough to prevent hash collisions between
             distinct tokens.
+        output_column: The name of the column that contains the hashed features.
 
     .. seealso::
         :class:`~ray.data.preprocessors.CountVectorizer`
@@ -75,11 +76,17 @@ class FeatureHasher(Preprocessor):
 
     _is_fittable = False
 
-    def __init__(self, columns: List[str], num_features: int):
+    def __init__(
+        self,
+        columns: List[str],
+        num_features: int,
+        output_column: str,
+    ):
         self.columns = columns
         # TODO(matt): Set default number of features.
         # This likely requires sparse matrix support to avoid explosion of columns.
         self.num_features = num_features
+        self.output_column = output_column
 
     def _transform_pandas(self, df: pd.DataFrame):
         # TODO(matt): Use sparse matrix for efficiency.
@@ -93,14 +100,19 @@ def row_feature_hasher(row):
         feature_columns = df.loc[:, self.columns].apply(
             row_feature_hasher, axis=1, result_type="expand"
         )
-        df = df.join(feature_columns)
 
-        # Drop original unhashed columns.
-        df.drop(columns=self.columns, inplace=True)
+        # Stack the per-row hash vectors into a single array-valued column.
+        hash_columns = [f"hash_{i}" for i in range(self.num_features)]
+        concatenated = feature_columns[hash_columns].to_numpy()
+        # Use a Pandas Series for column assignment to get more consistent
+        # behavior across Pandas versions.
+        df.loc[:, self.output_column] = pd.Series(list(concatenated), index=df.index)
+
         return df
 
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(columns={self.columns!r}, "
-            f"num_features={self.num_features!r})"
+            f"num_features={self.num_features!r}, "
+            f"output_column={self.output_column!r})"
         )
 
diff --git a/python/ray/data/tests/preprocessors/test_hasher.py b/python/ray/data/tests/preprocessors/test_hasher.py
index 24f3f7754e984..5bbb84f60df0c 100644
--- a/python/ray/data/tests/preprocessors/test_hasher.py
+++ b/python/ray/data/tests/preprocessors/test_hasher.py
@@ -13,23 +13,30 @@ def test_feature_hasher():
         {"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
     )
 
-    hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
+    hasher = FeatureHasher(
+        ["I", "like", "dislike", "Python"],
+        num_features=256,
+        output_column="hashed_features",
+    )
     document_term_matrix = hasher.fit_transform(
         ray.data.from_pandas(token_counts)
     ).to_pandas()
 
+    hashed_features = document_term_matrix["hashed_features"]
     # Document-term matrix should have shape (# documents, # features)
-    assert document_term_matrix.shape == (2, 256)
+    assert hashed_features.shape == (2,)
 
     # The tokens "I", "like", and "Python" should be hashed to distinct indices
     # for adequately large `num_features`.
-    assert document_term_matrix.iloc[0].sum() == 3
-    assert all(document_term_matrix.iloc[0] <= 1)
+    assert len(hashed_features.iloc[0]) == 256
+    assert hashed_features.iloc[0].sum() == 3
+    assert all(hashed_features.iloc[0] <= 1)
 
     # The tokens "I", "dislike", and "Python" should be hashed to distinct
     # indices for adequately large `num_features`.
-    assert document_term_matrix.iloc[1].sum() == 3
-    assert all(document_term_matrix.iloc[1] <= 1)
+    assert len(hashed_features.iloc[1]) == 256
+    assert hashed_features.iloc[1].sum() == 3
+    assert all(hashed_features.iloc[1] <= 1)
 
 
 if __name__ == "__main__":
diff --git a/python/ray/data/tests/preprocessors/test_preprocessors.py b/python/ray/data/tests/preprocessors/test_preprocessors.py
index c5ce492ce1109..0adc3d8a8889d 100644
--- a/python/ray/data/tests/preprocessors/test_preprocessors.py
+++ b/python/ray/data/tests/preprocessors/test_preprocessors.py
@@ -87,7 +87,7 @@ def preferred_batch_format(cls) -> BatchFormat:
     Categorizer(columns=["X"]),
     CountVectorizer(columns=["X"]),
     Chain(StandardScaler(columns=["X"]), MinMaxScaler(columns=["X"])),
-    FeatureHasher(columns=["X"], num_features=1, output_column="X_transformed"),
+    FeatureHasher(columns=["X"], num_features=1, output_column="X_transformed"),
     HashingVectorizer(columns=["X"], num_features=1),
     LabelEncoder(label_column="X"),
     MaxAbsScaler(columns=["X"]),
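
Usage sketch (not part of the patch): a minimal end-to-end example of the new
append-mode behavior, assuming this patch is applied. The toy data and column
names mirror the docstring example; `output_column="hashed"` is illustrative.

    import pandas as pd
    import ray
    from ray.data.preprocessors import FeatureHasher

    # Two "documents" represented as per-token counts, as in the docstring.
    token_counts = pd.DataFrame(
        {"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
    )
    ds = ray.data.from_pandas(token_counts)

    hasher = FeatureHasher(
        columns=["I", "like", "dislike", "Python"],
        num_features=8,
        output_column="hashed",
    )
    transformed = hasher.fit_transform(ds).to_pandas()

    # With this patch, the input columns are kept (append mode) and "hashed"
    # holds one array of length `num_features` per row.
    print(transformed.columns.tolist())
    print(transformed["hashed"].iloc[0])

Downstream consumers that still want the dense document-term matrix can
recover it with `np.stack(transformed["hashed"])`.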