From 23ff71d51321676d6856d76eb385b50739d12d12 Mon Sep 17 00:00:00 2001 From: Martin Bomio Date: Sat, 15 Feb 2025 00:54:53 +0000 Subject: [PATCH] feat: allow hasher to run on append mode Signed-off-by: Martin Bomio --- python/ray/data/preprocessors/hasher.py | 28 +++++++++++---- .../data/tests/preprocessors/test_hasher.py | 35 ++++++++++++++++++- 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 7045f2cd60d3b..7cd49fe2504a2 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -1,5 +1,5 @@ import collections -from typing import List +from typing import List, Optional import pandas as pd @@ -75,11 +75,22 @@ class FeatureHasher(Preprocessor): _is_fittable = False - def __init__(self, columns: List[str], num_features: int): + def __init__( + self, + columns: List[str], + num_features: int, + output_columns: Optional[List[str]] = None, + ): self.columns = columns # TODO(matt): Set default number of features. # This likely requires sparse matrix support to avoid explosion of columns. self.num_features = num_features + if output_columns and len(output_columns) != num_features: + raise ValueError( + "Invalid output_columns: Got num_features != len(output_columns)." + "The number of features and output_columns must match." + ) + self.output_columns = output_columns def _transform_pandas(self, df: pd.DataFrame): # TODO(matt): Use sparse matrix for efficiency. 
@@ -88,19 +99,27 @@ def row_feature_hasher(row): for column in self.columns: hashed_value = simple_hash(column, self.num_features) hash_counts[hashed_value] += row[column] - return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)} + output_columns = self.output_columns + if not output_columns: + output_columns = [f"hash_{i}" for i in range(self.num_features)] + return { + output_columns[i]: hash_counts[i] + for i in range(self.num_features) + } feature_columns = df.loc[:, self.columns].apply( row_feature_hasher, axis=1, result_type="expand" ) df = df.join(feature_columns) - # Drop original unhashed columns. - df.drop(columns=self.columns, inplace=True) + # Drop original unhashed columns if output_columns is None. + if not self.output_columns: + df.drop(columns=self.columns, inplace=True) return df def __repr__(self): return ( f"{self.__class__.__name__}(columns={self.columns!r}, " - f"num_features={self.num_features!r})" + f"num_features={self.num_features!r}, " + f"output_columns={self.output_columns!r})" ) diff --git a/python/ray/data/tests/preprocessors/test_hasher.py b/python/ray/data/tests/preprocessors/test_hasher.py index 03985ff4f64c0..d0c7b6631c61f 100644 --- a/python/ray/data/tests/preprocessors/test_hasher.py +++ b/python/ray/data/tests/preprocessors/test_hasher.py @@ -31,6 +31,28 @@ def test_feature_hasher(): assert document_term_matrix.iloc[1].sum() == 3 assert all(document_term_matrix.iloc[1] <= 1) + # Test append mode + with pytest.raises(ValueError): + FeatureHasher( + columns=["I", "like", "dislike", "Python"], + num_features=256, + output_columns=["B_encoded"], + ) + + hasher_append = FeatureHasher( + ["I", "like", "dislike", "Python"], + num_features=256, + output_columns=[f"hash_{i}" for i in range(256)], + ) + document_term_matrix_append = hasher_append.fit_transform( + ray.data.from_pandas(token_counts) + ).to_pandas() + + assert document_term_matrix_append.shape == ( + 2, + 256 + 4, + ) # original columns + hashed columns + def test_hashing_vectorizer(): """Tests basic HashingVectorizer functionality.""" @@ -40,7 +62,18 @@ def
test_hashing_vectorizer(): in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b}) ds = ray.data.from_pandas(in_df) - vectorizer = HashingVectorizer(["A", "B"], num_features=3) + vectorizer = HashingVectorizer( + ["A", "B"], + num_features=3, + output_columns=[ + "hash_A_0", + "hash_A_1", + "hash_A_2", + "hash_B_0", + "hash_B_1", + "hash_B_2", + ], + ) transformed = vectorizer.transform(ds) out_df = transformed.to_pandas()