Skip to content

Commit

Permalink
[data/preprocessors] feat: allow hasher to run on append mode (ray-project#50632)
Browse files Browse the repository at this point in the history

<!-- Thank you for your contribution! Please review
https://github.com/ray-project/ray/blob/master/CONTRIBUTING.rst before
opening a pull request. -->

<!-- Please add a reviewer to the assignee section when you create a PR.
If you don't have the access to it, we will shortly find a reviewer and
assign them to your PR. -->

## Why are these changes needed?

This is part of ray-project#48133.
Continuing the approach taken in
ray-project#49426, make all the hashers work
in append mode

## Related issue number

ray-project#49426

## Checks

- [x] I've signed off every commit (by using the -s flag, i.e., `git
commit -s`) in this PR.
- [ ] I've run `scripts/format.sh` to lint the changes in this PR.
- [x] I've included any doc changes needed for
https://docs.ray.io/en/master/.
- [x] I've added any new APIs to the API Reference. For example, if I
added a
method in Tune, I've added it in `doc/source/tune/api/` under the
           corresponding `.rst` file.
- [x] I've made sure the tests are passing. Note that there might be a
few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
   - [x] Unit tests
   - [ ] Release tests
   - [ ] This PR is not tested :(

Signed-off-by: Martin Bomio <martinbomio@spotify.com>
  • Loading branch information
martinbomio authored and Michaelhess17 committed Mar 3, 2025
1 parent 09c2434 commit b482532
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 14 deletions.
26 changes: 19 additions & 7 deletions python/ray/data/preprocessors/hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ class FeatureHasher(Preprocessor):
:class:`FeatureHasher` hashes each token to determine its index. For example,
the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.
>>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
>>> hasher.fit_transform(ds).to_pandas().to_numpy() # doctest: +SKIP
>>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column = "hashed")
>>> hasher.fit_transform(ds)["hashed"].to_pandas().to_numpy() # doctest: +SKIP
array([[0, 0, 0, 2, 0, 1, 0, 0],
[0, 0, 0, 1, 0, 1, 1, 0]])
Expand All @@ -63,6 +63,7 @@ class FeatureHasher(Preprocessor):
num_features: The number of features used to represent the vocabulary. You
should choose a value large enough to prevent hash collisions between
distinct tokens.
output_column: The name of the column that contains the hashed features.
.. seealso::
:class:`~ray.data.preprocessors.CountVectorizer`
Expand All @@ -75,11 +76,17 @@ class FeatureHasher(Preprocessor):

_is_fittable = False

def __init__(self, columns: List[str], num_features: int):
def __init__(
self,
columns: List[str],
num_features: int,
output_column: str,
):
self.columns = columns
# TODO(matt): Set default number of features.
# This likely requires sparse matrix support to avoid explosion of columns.
self.num_features = num_features
self.output_column = output_column

def _transform_pandas(self, df: pd.DataFrame):
# TODO(matt): Use sparse matrix for efficiency.
Expand All @@ -93,14 +100,19 @@ def row_feature_hasher(row):
feature_columns = df.loc[:, self.columns].apply(
row_feature_hasher, axis=1, result_type="expand"
)
df = df.join(feature_columns)

# Drop original unhashed columns.
df.drop(columns=self.columns, inplace=True)
# Concatenate the hash columns
hash_columns = [f"hash_{i}" for i in range(self.num_features)]
concatenated = feature_columns[hash_columns].to_numpy()
# Use a Pandas Series for column assignment to get more consistent
# behavior across Pandas versions.
df.loc[:, self.output_column] = pd.Series(list(concatenated))

return df

def __repr__(self):
return (
f"{self.__class__.__name__}(columns={self.columns!r}, "
f"num_features={self.num_features!r})"
f"num_features={self.num_features!r}, "
f"output_column={self.output_column!r})"
)
19 changes: 13 additions & 6 deletions python/ray/data/tests/preprocessors/test_hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,30 @@ def test_feature_hasher():
{"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
)

hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
hasher = FeatureHasher(
["I", "like", "dislike", "Python"],
num_features=256,
output_column="hashed_features",
)
document_term_matrix = hasher.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()

hashed_features = document_term_matrix["hashed_features"]
# Document-term matrix should have shape (# documents, # features)
assert document_term_matrix.shape == (2, 256)
assert hashed_features.shape == (2,)

# The tokens "I", "like", and "Python" should be hashed to distinct indices
# for adequately large `num_features`.
assert document_term_matrix.iloc[0].sum() == 3
assert all(document_term_matrix.iloc[0] <= 1)
assert len(hashed_features.iloc[0]) == 256
assert hashed_features.iloc[0].sum() == 3
assert all(hashed_features.iloc[0] <= 1)

# The tokens "I", "dislike", and "Python" should be hashed to distinct
# indices for adequately large `num_features`.
assert document_term_matrix.iloc[1].sum() == 3
assert all(document_term_matrix.iloc[1] <= 1)
assert len(hashed_features.iloc[1]) == 256
assert hashed_features.iloc[1].sum() == 3
assert all(hashed_features.iloc[1] <= 1)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion python/ray/data/tests/preprocessors/test_preprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def preferred_batch_format(cls) -> BatchFormat:
Categorizer(columns=["X"]),
CountVectorizer(columns=["X"]),
Chain(StandardScaler(columns=["X"]), MinMaxScaler(columns=["X"])),
FeatureHasher(columns=["X"], num_features=1),
FeatureHasher(columns=["X"], num_features=1, output_column="X_transformed"),
HashingVectorizer(columns=["X"], num_features=1),
LabelEncoder(label_column="X"),
MaxAbsScaler(columns=["X"]),
Expand Down

0 comments on commit b482532

Please sign in to comment.