add output_column
Signed-off-by: Martin Bomio <martinbomio@spotify.com>
martinbomio committed Feb 21, 2025
1 parent 3630de5 commit 97c6a27
Showing 3 changed files with 75 additions and 94 deletions.
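
At a glance, the commit replaces FeatureHasher's per-feature output_columns list with a single output_column whose cells each hold the full hashed vector. A minimal sketch of the new usage (not part of the commit; the dataset mirrors the token counts from the docstring example below):

    import ray
    from ray.data.preprocessors import FeatureHasher

    ds = ray.data.from_items([
        {"I": 1, "like": 1, "dislike": 0, "Python": 1},
        {"I": 1, "like": 0, "dislike": 1, "Python": 1},
    ])

    # Before this commit: one "hash_{i}" column per feature, named via
    # output_columns. After: one column whose cells hold the whole vector.
    hasher = FeatureHasher(
        columns=["I", "like", "dislike", "Python"],
        num_features=8,
        output_column="hashed",
    )
    print(hasher.fit_transform(ds).to_pandas()["hashed"].iloc[0])  # length-8 vector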
34 changes: 15 additions & 19 deletions python/ray/data/preprocessors/hasher.py
@@ -1,5 +1,5 @@
 import collections
-from typing import List, Optional
+from typing import List
 
 import pandas as pd
 
@@ -48,8 +48,8 @@ class FeatureHasher(Preprocessor):
     :class:`FeatureHasher` hashes each token to determine its index. For example,
     the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.
-    >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
-    >>> hasher.fit_transform(ds).to_pandas().to_numpy()  # doctest: +SKIP
+    >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column="hashed")
+    >>> hasher.fit_transform(ds)["hashed"].to_pandas().to_numpy()  # doctest: +SKIP
     array([[0, 0, 0, 2, 0, 1, 0, 0],
            [0, 0, 0, 1, 0, 1, 1, 0]])
@@ -63,6 +63,7 @@ class FeatureHasher(Preprocessor):
         num_features: The number of features used to represent the vocabulary. You
             should choose a value large enough to prevent hash collisions between
             distinct tokens.
+        output_column: The name of the column that contains the hashed features.
     .. seealso::
         :class:`~ray.data.preprocessors.CountVectorizer`
@@ -79,18 +80,13 @@ def __init__(
         self,
         columns: List[str],
         num_features: int,
-        output_columns: Optional[List[str]] = None,
+        output_column: str,
     ):
         self.columns = columns
         # TODO(matt): Set default number of features.
         # This likely requires sparse matrix support to avoid explosion of columns.
         self.num_features = num_features
-        if output_columns and len(output_columns) != num_features:
-            raise ValueError(
-                "Invalid output_columns: Got num_features != len(output_columns)."
-                "The number of features and output_columns must match."
-            )
-        self.output_columns = output_columns
+        self.output_column = output_column
 
     def _transform_pandas(self, df: pd.DataFrame):
         # TODO(matt): Use sparse matrix for efficiency.
@@ -99,24 +95,24 @@ def row_feature_hasher(row):
             for column in self.columns:
                 hashed_value = simple_hash(column, self.num_features)
                 hash_counts[hashed_value] += row[column]
-            return {
-                f"hash_{output_column}": hash_counts[i]
-                for output_column in self.output_columns
-            }
+            return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)}
 
         feature_columns = df.loc[:, self.columns].apply(
             row_feature_hasher, axis=1, result_type="expand"
         )
-        df = df.join(feature_columns)
-
-        # Drop original unhashed columns if output_columns is None.
-        if not self.output_columns:
-            df.drop(columns=self.columns, inplace=True)
+        # Concatenate the hash columns
+        hash_columns = [f"hash_{i}" for i in range(self.num_features)]
+        concatenated = feature_columns[hash_columns].to_numpy()
+        # Use a Pandas Series for column assignment to get more consistent
+        # behavior across Pandas versions.
+        df.loc[:, self.output_column] = pd.Series(list(concatenated))
 
         return df
 
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(columns={self.columns!r}, "
             f"num_features={self.num_features!r}, "
-            f"output_columns={self.output_columns!r})"
+            f"output_column={self.output_column!r})"
         )
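
The new comment about "more consistent behavior across Pandas versions" concerns assigning a 2-D array to a single column: pandas expects one value per row, so a plain NumPy assignment raises, while a Series of row vectors stores one feature vector per cell. A small standalone illustration of the pattern (not Ray code; the names here are made up for the example):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"doc": ["d0", "d1"]})
    hashed = np.array([[0, 2, 1], [1, 0, 3]])  # shape (n_rows, num_features)

    # df["hashed"] = hashed would raise: pandas sees a (2, 3) block where it
    # expects 2 cells. Wrapping the rows in a Series yields one vector per cell.
    df.loc[:, "hashed"] = pd.Series(list(hashed), index=df.index)
    print(df["hashed"].iloc[0])  # [0 2 1]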
88 changes: 14 additions & 74 deletions python/ray/data/tests/preprocessors/test_hasher.py
@@ -2,7 +2,7 @@
 import pytest
 
 import ray
-from ray.data.preprocessors import FeatureHasher, HashingVectorizer
+from ray.data.preprocessors import FeatureHasher
 
 
 def test_feature_hasher():
@@ -13,90 +13,30 @@ def test_feature_hasher():
{"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
)

hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
hasher = FeatureHasher(
["I", "like", "dislike", "Python"],
num_features=256,
output_column="hashed_features",
)
document_term_matrix = hasher.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()

hashed_features = document_term_matrix["hashed_features"]
# Document-term matrix should have shape (# documents, # features)
assert document_term_matrix.shape == (2, 256)
assert hashed_features.shape == (2,)

# The tokens tokens "I", "like", and "Python" should be hashed to distinct indices
# for adequately large `num_features`.
assert document_term_matrix.iloc[0].sum() == 3
assert all(document_term_matrix.iloc[0] <= 1)
assert len(hashed_features.iloc[0]) == 256
assert hashed_features.iloc[0].sum() == 3
assert all(hashed_features.iloc[0] <= 1)

# The tokens tokens "I", "dislike", and "Python" should be hashed to distinct
# indices for adequately large `num_features`.
assert document_term_matrix.iloc[1].sum() == 3
assert all(document_term_matrix.iloc[1] <= 1)

# Test append mode
with pytest.raises(ValueError):
FeatureHasher(
columns=["I", "like", "dislike", "Python"],
num_features=256,
output_columns=["B_encoded"],
)

hasher_append = FeatureHasher(
["I", "like", "dislike", "Python"],
num_features=256,
output_columns=[f"hash_{i}" for i in range(256)],
)
document_term_matrix_append = hasher_append.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()

assert document_term_matrix_append.shape == (
2,
256 + 4,
) # original columns + hashed columns


def test_hashing_vectorizer():
"""Tests basic HashingVectorizer functionality."""

col_a = ["a b b c c c", "a a a a c"]
col_b = ["apple", "banana banana banana"]
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
ds = ray.data.from_pandas(in_df)

vectorizer = HashingVectorizer(
["A", "B"],
num_features=3,
output_columns=[
"hash_A_0",
"hash_A_1",
"hash_A_2",
"hash_B_0",
"hash_B_1",
"hash_B_2",
],
)

transformed = vectorizer.transform(ds)
out_df = transformed.to_pandas()

processed_col_a_0 = [2, 0]
processed_col_a_1 = [1, 4]
processed_col_a_2 = [3, 1]
processed_col_b_0 = [1, 0]
processed_col_b_1 = [0, 3]
processed_col_b_2 = [0, 0]

expected_df = pd.DataFrame.from_dict(
{
"hash_A_0": processed_col_a_0,
"hash_A_1": processed_col_a_1,
"hash_A_2": processed_col_a_2,
"hash_B_0": processed_col_b_0,
"hash_B_1": processed_col_b_1,
"hash_B_2": processed_col_b_2,
}
)

assert out_df.equals(expected_df)
assert len(hashed_features.iloc[1]) == 256
assert hashed_features.iloc[1].sum() == 3
assert all(hashed_features.iloc[1] <= 1)


if __name__ == "__main__":
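
For context on what the updated test asserts: each input column name is hashed into one of num_features buckets and the row's counts are summed per bucket, so with 256 buckets the three present tokens almost surely land in distinct indices. A rough sketch of that logic; simple_hash here is a hypothetical md5-based stand-in, not Ray's actual implementation:

    import hashlib
    from collections import Counter

    def simple_hash(value, num_buckets):
        # Hypothetical stand-in for Ray's hash: deterministic digest mod buckets.
        digest = hashlib.md5(str(value).encode()).hexdigest()
        return int(digest, 16) % num_buckets

    row = {"I": 1, "like": 1, "dislike": 0, "Python": 1}
    counts = Counter()
    for token, count in row.items():
        counts[simple_hash(token, 256)] += count

    # Three tokens are present, so the bucket counts sum to 3; with 256
    # buckets, collisions are unlikely and no single bucket exceeds 1.
    assert sum(counts.values()) == 3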
47 changes: 46 additions & 1 deletion python/ray/data/tests/preprocessors/test_vectorizer.py
@@ -4,7 +4,7 @@
 import pytest
 
 import ray
-from ray.data.preprocessors import CountVectorizer
+from ray.data.preprocessors import CountVectorizer, HashingVectorizer
 
 
 def test_count_vectorizer():
@@ -82,6 +82,51 @@ def test_count_vectorizer():
     assert out_df.equals(expected_df)
 
 
+def test_hashing_vectorizer():
+    """Tests basic HashingVectorizer functionality."""
+
+    col_a = ["a b b c c c", "a a a a c"]
+    col_b = ["apple", "banana banana banana"]
+    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
+    ds = ray.data.from_pandas(in_df)
+
+    vectorizer = HashingVectorizer(
+        ["A", "B"],
+        num_features=3,
+        output_columns=[
+            "hash_A_0",
+            "hash_A_1",
+            "hash_A_2",
+            "hash_B_0",
+            "hash_B_1",
+            "hash_B_2",
+        ],
+    )
+
+    transformed = vectorizer.transform(ds)
+    out_df = transformed.to_pandas()
+
+    processed_col_a_0 = [2, 0]
+    processed_col_a_1 = [1, 4]
+    processed_col_a_2 = [3, 1]
+    processed_col_b_0 = [1, 0]
+    processed_col_b_1 = [0, 3]
+    processed_col_b_2 = [0, 0]
+
+    expected_df = pd.DataFrame.from_dict(
+        {
+            "hash_A_0": processed_col_a_0,
+            "hash_A_1": processed_col_a_1,
+            "hash_A_2": processed_col_a_2,
+            "hash_B_0": processed_col_b_0,
+            "hash_B_1": processed_col_b_1,
+            "hash_B_2": processed_col_b_2,
+        }
+    )
+
+    assert out_df.equals(expected_df)
 
 
 if __name__ == "__main__":
     import sys
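
Note that, unlike FeatureHasher above, HashingVectorizer keeps the per-column output_columns list in this test: it tokenizes each text column and emits one hash_{column}_{i} count column per bucket. A generic sketch of that per-column bucketing, using Python's built-in hash as a stand-in (Ray's actual bucket assignments will differ):

    def hashing_vectorize(text, num_features, hasher=hash):
        # Whitespace-tokenize, then count tokens per hash bucket.
        counts = [0] * num_features
        for token in text.split():
            counts[hasher(token) % num_features] += 1
        return counts

    # "a b b c c c" has 6 tokens, so its 3 bucket counts always sum to 6,
    # matching the test's first row for column "A" ([2, 1, 3] there).
    assert sum(hashing_vectorize("a b b c c c", 3)) == 6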
