Skip to content

Commit

Permalink
feat: allow hasher to run in append mode
Browse files Browse the repository at this point in the history
Signed-off-by: Martin Bomio <martinbomio@spotify.com>
  • Loading branch information
martinbomio committed Feb 15, 2025
1 parent bdeeaa6 commit 23ff71d
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 7 deletions.
28 changes: 22 additions & 6 deletions python/ray/data/preprocessors/hasher.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import collections
from typing import List
from typing import List, Optional

import pandas as pd

Expand Down Expand Up @@ -75,11 +75,22 @@ class FeatureHasher(Preprocessor):

_is_fittable = False

def __init__(self, columns: List[str], num_features: int):
def __init__(
    self,
    columns: List[str],
    num_features: int,
    output_columns: Optional[List[str]] = None,
):
    """Store the hasher configuration and validate ``output_columns``.

    Args:
        columns: Names of the input columns to hash.
        num_features: Number of hash features (buckets) to produce.
        output_columns: Optional names for the hashed output columns. When
            given and non-empty, its length must equal ``num_features``.

    Raises:
        ValueError: If ``output_columns`` is non-empty and its length
            differs from ``num_features``.
    """
    self.columns = columns
    # TODO(matt): Set default number of features.
    # This likely requires sparse matrix support to avoid explosion of columns.
    self.num_features = num_features
    # Truthiness check on purpose: both None and an empty list skip validation.
    if output_columns and len(output_columns) != num_features:
        # The trailing space after the first sentence matters: adjacent string
        # literals are concatenated verbatim.
        raise ValueError(
            "Invalid output_columns: Got num_features != len(output_columns). "
            "The number of features and output_columns must match."
        )
    self.output_columns = output_columns

def _transform_pandas(self, df: pd.DataFrame):
# TODO(matt): Use sparse matrix for efficiency.
Expand All @@ -88,19 +99,24 @@ def row_feature_hasher(row):
for column in self.columns:
hashed_value = simple_hash(column, self.num_features)
hash_counts[hashed_value] += row[column]
return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)}
return {
f"hash_{output_column}": hash_counts[i]
for output_column in self.output_columns
}

feature_columns = df.loc[:, self.columns].apply(
row_feature_hasher, axis=1, result_type="expand"
)
df = df.join(feature_columns)

# Drop original unhashed columns.
df.drop(columns=self.columns, inplace=True)
# Drop original unhashed columns unless output_columns is set (append mode).
if not self.output_columns:
df.drop(columns=self.columns, inplace=True)
return df

def __repr__(self):
return (
f"{self.__class__.__name__}(columns={self.columns!r}, "
f"num_features={self.num_features!r})"
f"num_features={self.num_features!r}, "
f"output_columns={self.output_columns!r})"
)
35 changes: 34 additions & 1 deletion python/ray/data/tests/preprocessors/test_hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,28 @@ def test_feature_hasher():
assert document_term_matrix.iloc[1].sum() == 3
assert all(document_term_matrix.iloc[1] <= 1)

# Test append mode
with pytest.raises(ValueError):
FeatureHasher(
columns=["I", "like", "dislike", "Python"],
num_features=256,
output_columns=["B_encoded"],
)

hasher_append = FeatureHasher(
["I", "like", "dislike", "Python"],
num_features=256,
output_columns=[f"hash_{i}" for i in range(256)],
)
document_term_matrix_append = hasher_append.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()

assert document_term_matrix_append.shape == (
2,
256 + 4,
) # original columns + hashed columns


def test_hashing_vectorizer():
"""Tests basic HashingVectorizer functionality."""
Expand All @@ -40,7 +62,18 @@ def test_hashing_vectorizer():
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
ds = ray.data.from_pandas(in_df)

vectorizer = HashingVectorizer(["A", "B"], num_features=3)
vectorizer = HashingVectorizer(
["A", "B"],
num_features=3,
output_columns=[
"hash_A_0",
"hash_A_1",
"hash_A_2",
"hash_B_0",
"hash_B_1",
"hash_B_2",
],
)

transformed = vectorizer.transform(ds)
out_df = transformed.to_pandas()
Expand Down

0 comments on commit 23ff71d

Please sign in to comment.