From 89515e0c3e491e5c652d2290cc63681aeb2762c0 Mon Sep 17 00:00:00 2001
From: Martin Bomio
Date: Sat, 15 Feb 2025 00:54:53 +0000
Subject: [PATCH] feat: allow hasher to run in append mode

Signed-off-by: Martin Bomio
---
 python/ray/data/preprocessors/hasher.py        | 26 ++++++++++++++-----
 .../data/tests/preprocessors/test_hasher.py    | 19 +++++++++-----
 .../tests/preprocessors/test_preprocessors.py  |  2 +-
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py
index 7045f2cd60d3b..ae9ae769ad9b5 100644
--- a/python/ray/data/preprocessors/hasher.py
+++ b/python/ray/data/preprocessors/hasher.py
@@ -48,8 +48,8 @@ class FeatureHasher(Preprocessor):
         :class:`FeatureHasher` hashes each token to determine its index. For example,
         the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.
 
-        >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
-        >>> hasher.fit_transform(ds).to_pandas().to_numpy()  # doctest: +SKIP
+        >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column="hashed")
+        >>> hasher.fit_transform(ds).to_pandas()["hashed"].to_numpy()  # doctest: +SKIP
         array([[0, 0, 0, 2, 0, 1, 0, 0],
                [0, 0, 0, 1, 0, 1, 1, 0]])
 
@@ -63,6 +63,7 @@ class FeatureHasher(Preprocessor):
         num_features: The number of features used to represent the vocabulary. You
             should choose a value large enough to prevent hash collisions between
             distinct tokens.
+        output_column: The name of the column that contains the hashed features.
 
     .. seealso::
         :class:`~ray.data.preprocessors.CountVectorizer`
@@ -75,11 +76,17 @@ class FeatureHasher(Preprocessor):
 
     _is_fittable = False
 
-    def __init__(self, columns: List[str], num_features: int):
+    def __init__(
+        self,
+        columns: List[str],
+        num_features: int,
+        output_column: str,
+    ):
         self.columns = columns
         # TODO(matt): Set default number of features.
         # This likely requires sparse matrix support to avoid explosion of columns.
         self.num_features = num_features
+        self.output_column = output_column
 
     def _transform_pandas(self, df: pd.DataFrame):
         # TODO(matt): Use sparse matrix for efficiency.
@@ -93,14 +100,19 @@ def row_feature_hasher(row):
         feature_columns = df.loc[:, self.columns].apply(
             row_feature_hasher, axis=1, result_type="expand"
         )
-        df = df.join(feature_columns)
 
-        # Drop original unhashed columns.
-        df.drop(columns=self.columns, inplace=True)
+        # Stack the per-row hash vectors into a single array-valued column.
+        hash_columns = [f"hash_{i}" for i in range(self.num_features)]
+        concatenated = feature_columns[hash_columns].to_numpy()
+        # Use a Pandas Series for column assignment to get more consistent
+        # behavior across Pandas versions.
+        df.loc[:, self.output_column] = pd.Series(list(concatenated), index=df.index)
+
         return df
 
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(columns={self.columns!r}, "
-            f"num_features={self.num_features!r})"
+            f"num_features={self.num_features!r}, "
+            f"output_column={self.output_column!r})"
         )
 
diff --git a/python/ray/data/tests/preprocessors/test_hasher.py b/python/ray/data/tests/preprocessors/test_hasher.py
index 24f3f7754e984..5bbb84f60df0c 100644
--- a/python/ray/data/tests/preprocessors/test_hasher.py
+++ b/python/ray/data/tests/preprocessors/test_hasher.py
@@ -13,23 +13,30 @@ def test_feature_hasher():
         {"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
     )
 
-    hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
+    hasher = FeatureHasher(
+        ["I", "like", "dislike", "Python"],
+        num_features=256,
+        output_column="hashed_features",
+    )
     document_term_matrix = hasher.fit_transform(
         ray.data.from_pandas(token_counts)
     ).to_pandas()
 
+    hashed_features = document_term_matrix["hashed_features"]
     # Document-term matrix should have shape (# documents, # features)
-    assert document_term_matrix.shape == (2, 256)
+    assert hashed_features.shape == (2,)
 
     # The tokens "I", "like", and "Python" should be hashed to distinct indices
     # for adequately large `num_features`.
-    assert document_term_matrix.iloc[0].sum() == 3
-    assert all(document_term_matrix.iloc[0] <= 1)
+    assert len(hashed_features.iloc[0]) == 256
+    assert hashed_features.iloc[0].sum() == 3
+    assert all(hashed_features.iloc[0] <= 1)
 
     # The tokens "I", "dislike", and "Python" should be hashed to distinct
     # indices for adequately large `num_features`.
-    assert document_term_matrix.iloc[1].sum() == 3
-    assert all(document_term_matrix.iloc[1] <= 1)
+    assert len(hashed_features.iloc[1]) == 256
+    assert hashed_features.iloc[1].sum() == 3
+    assert all(hashed_features.iloc[1] <= 1)
 
 
 if __name__ == "__main__":
diff --git a/python/ray/data/tests/preprocessors/test_preprocessors.py b/python/ray/data/tests/preprocessors/test_preprocessors.py
index c5ce492ce1109..0adc3d8a8889d 100644
--- a/python/ray/data/tests/preprocessors/test_preprocessors.py
+++ b/python/ray/data/tests/preprocessors/test_preprocessors.py
@@ -87,7 +87,7 @@ def preferred_batch_format(cls) -> BatchFormat:
     Categorizer(columns=["X"]),
     CountVectorizer(columns=["X"]),
     Chain(StandardScaler(columns=["X"]), MinMaxScaler(columns=["X"])),
-    FeatureHasher(columns=["X"], num_features=1, output_column="X_transformed"),
+    FeatureHasher(columns=["X"], num_features=1, output_column="X_transformed"),
     HashingVectorizer(columns=["X"], num_features=1),
     LabelEncoder(label_column="X"),
     MaxAbsScaler(columns=["X"]),
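
Usage sketch (not part of the patch): a minimal end-to-end example of the new
append-mode behavior, assuming this patch is applied. The toy data and column
names mirror the docstring example; `output_column="hashed"` is illustrative.

    import pandas as pd
    import ray
    from ray.data.preprocessors import FeatureHasher

    # Two "documents" represented as per-token counts, as in the docstring.
    token_counts = pd.DataFrame(
        {"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
    )
    ds = ray.data.from_pandas(token_counts)

    hasher = FeatureHasher(
        columns=["I", "like", "dislike", "Python"],
        num_features=8,
        output_column="hashed",
    )
    transformed = hasher.fit_transform(ds).to_pandas()

    # With this patch, the input columns are kept (append mode) and "hashed"
    # holds one array of length `num_features` per row.
    print(transformed.columns.tolist())
    print(transformed["hashed"].iloc[0])

Downstream consumers that still want the dense document-term matrix can
recover it with `np.stack(transformed["hashed"])`.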