Skip to content

Commit

Permalink
[data/preprocessors] feat: allow hasher to run on append mode (ray-project#50632)
Browse files Browse the repository at this point in the history

<!-- Thank you for your contribution! Please review
https://github.com/ray-project/ray/blob/master/CONTRIBUTING.rst before
opening a pull request. -->

<!-- Please add a reviewer to the assignee section when you create a PR.
If you don't have the access to it, we will shortly find a reviewer and
assign them to your PR. -->

## Why are these changes needed?

This is part of ray-project#48133.
Continuing the approach taken in
ray-project#49426, make all the hashers work
in append mode

## Related issue number

ray-project#49426

## Checks

- [x] I've signed off every commit (by using the -s flag, i.e., `git
commit -s`) in this PR.
- [ ] I've run `scripts/format.sh` to lint the changes in this PR.
- [x] I've included any doc changes needed for
https://docs.ray.io/en/master/.
- [x] I've added any new APIs to the API Reference. For example, if I
added a
method in Tune, I've added it in `doc/source/tune/api/` under the
           corresponding `.rst` file.
- [x] I've made sure the tests are passing. Note that there might be a
few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
   - [x] Unit tests
   - [ ] Release tests
   - [ ] This PR is not tested :(

Signed-off-by: Martin Bomio <martinbomio@spotify.com>
  • Loading branch information
martinbomio authored and Michaelhess17 committed Mar 3, 2025
1 parent 09c2434 commit b482532
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 14 deletions.
26 changes: 19 additions & 7 deletions python/ray/data/preprocessors/hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ class FeatureHasher(Preprocessor):
:class:`FeatureHasher` hashes each token to determine its index. For example,
the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.
>>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
>>> hasher.fit_transform(ds).to_pandas().to_numpy() # doctest: +SKIP
>>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column = "hashed")
>>> hasher.fit_transform(ds)["hashed"].to_pandas().to_numpy() # doctest: +SKIP
array([[0, 0, 0, 2, 0, 1, 0, 0],
[0, 0, 0, 1, 0, 1, 1, 0]])
Expand All @@ -63,6 +63,7 @@ class FeatureHasher(Preprocessor):
num_features: The number of features used to represent the vocabulary. You
should choose a value large enough to prevent hash collisions between
distinct tokens.
output_column: The name of the column that contains the hashed features.
.. seealso::
:class:`~ray.data.preprocessors.CountVectorizer`
Expand All @@ -75,11 +76,17 @@ class FeatureHasher(Preprocessor):

_is_fittable = False

def __init__(self, columns: List[str], num_features: int):
def __init__(
self,
columns: List[str],
num_features: int,
output_column: str,
):
self.columns = columns
# TODO(matt): Set default number of features.
# This likely requires sparse matrix support to avoid explosion of columns.
self.num_features = num_features
self.output_column = output_column

def _transform_pandas(self, df: pd.DataFrame):
# TODO(matt): Use sparse matrix for efficiency.
Expand All @@ -93,14 +100,19 @@ def row_feature_hasher(row):
feature_columns = df.loc[:, self.columns].apply(
row_feature_hasher, axis=1, result_type="expand"
)
df = df.join(feature_columns)

# Drop original unhashed columns.
df.drop(columns=self.columns, inplace=True)
# Concatenate the hash columns
hash_columns = [f"hash_{i}" for i in range(self.num_features)]
concatenated = feature_columns[hash_columns].to_numpy()
# Use a Pandas Series for column assignment to get more consistent
# behavior across Pandas versions.
df.loc[:, self.output_column] = pd.Series(list(concatenated))

return df

def __repr__(self):
return (
f"{self.__class__.__name__}(columns={self.columns!r}, "
f"num_features={self.num_features!r})"
f"num_features={self.num_features!r}, "
f"output_column={self.output_column!r})"
)
19 changes: 13 additions & 6 deletions python/ray/data/tests/preprocessors/test_hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,30 @@ def test_feature_hasher():
{"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
)

hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
hasher = FeatureHasher(
["I", "like", "dislike", "Python"],
num_features=256,
output_column="hashed_features",
)
document_term_matrix = hasher.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()

hashed_features = document_term_matrix["hashed_features"]
# Document-term matrix should have shape (# documents, # features)
assert document_term_matrix.shape == (2, 256)
assert hashed_features.shape == (2,)

# The tokens "I", "like", and "Python" should be hashed to distinct indices
# for adequately large `num_features`.
assert document_term_matrix.iloc[0].sum() == 3
assert all(document_term_matrix.iloc[0] <= 1)
assert len(hashed_features.iloc[0]) == 256
assert hashed_features.iloc[0].sum() == 3
assert all(hashed_features.iloc[0] <= 1)

# The tokens "I", "dislike", and "Python" should be hashed to distinct
# indices for adequately large `num_features`.
assert document_term_matrix.iloc[1].sum() == 3
assert all(document_term_matrix.iloc[1] <= 1)
assert len(hashed_features.iloc[1]) == 256
assert hashed_features.iloc[1].sum() == 3
assert all(hashed_features.iloc[1] <= 1)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion python/ray/data/tests/preprocessors/test_preprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def preferred_batch_format(cls) -> BatchFormat:
Categorizer(columns=["X"]),
CountVectorizer(columns=["X"]),
Chain(StandardScaler(columns=["X"]), MinMaxScaler(columns=["X"])),
FeatureHasher(columns=["X"], num_features=1),
FeatureHasher(columns=["X"], num_features=1, output_column="X_transformed"),
HashingVectorizer(columns=["X"], num_features=1),
LabelEncoder(label_column="X"),
MaxAbsScaler(columns=["X"]),
Expand Down

0 comments on commit b482532

Please sign in to comment.