Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[data/preprocessors] feat: allow hasher to run on append mode #50632

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 19 additions & 7 deletions python/ray/data/preprocessors/hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ class FeatureHasher(Preprocessor):
:class:`FeatureHasher` hashes each token to determine its index. For example,
the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.

>>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
>>> hasher.fit_transform(ds).to_pandas().to_numpy() # doctest: +SKIP
>>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column="hashed")
>>> hasher.fit_transform(ds).to_pandas()["hashed"].to_numpy()  # doctest: +SKIP
array([[0, 0, 0, 2, 0, 1, 0, 0],
[0, 0, 0, 1, 0, 1, 1, 0]])

Expand All @@ -63,6 +63,7 @@ class FeatureHasher(Preprocessor):
num_features: The number of features used to represent the vocabulary. You
should choose a value large enough to prevent hash collisions between
distinct tokens.
output_column: The name of the column that contains the hashed features.

.. seealso::
:class:`~ray.data.preprocessors.CountVectorizer`
Expand All @@ -75,11 +76,17 @@ class FeatureHasher(Preprocessor):

_is_fittable = False

def __init__(self, columns: List[str], num_features: int):
def __init__(
    self,
    columns: List[str],
    num_features: int,
    output_column: str,
):
    """Initialize the hasher with the input columns and output layout.

    Args:
        columns: Names of the columns whose values are hashed together
            into the feature vector.
        num_features: Size of the hashed feature space. Choose a value
            large enough to keep hash collisions between distinct tokens
            unlikely.
        output_column: Name of the column that will contain the hashed
            features produced by ``_transform_pandas``.
    """
    self.columns = columns
    # TODO(matt): Set default number of features.
    # This likely requires sparse matrix support to avoid explosion of columns.
    self.num_features = num_features
    self.output_column = output_column

def _transform_pandas(self, df: pd.DataFrame):
# TODO(matt): Use sparse matrix for efficiency.
Expand All @@ -93,14 +100,19 @@ def row_feature_hasher(row):
feature_columns = df.loc[:, self.columns].apply(
row_feature_hasher, axis=1, result_type="expand"
)
df = df.join(feature_columns)

# Drop original unhashed columns.
df.drop(columns=self.columns, inplace=True)
# Concatenate the hash columns
hash_columns = [f"hash_{i}" for i in range(self.num_features)]
concatenated = feature_columns[hash_columns].to_numpy()
# Use a Pandas Series for column assignment to get more consistent
# behavior across Pandas versions.
df.loc[:, self.output_column] = pd.Series(list(concatenated))

return df

def __repr__(self):
    """Return an eval-style representation listing all constructor arguments.

    The scraped diff interleaved the removed pre-change line with the added
    lines, leaving two conflicting ``num_features`` fragments; this is the
    reconstructed post-change method, which includes ``output_column``.
    """
    return (
        f"{self.__class__.__name__}(columns={self.columns!r}, "
        f"num_features={self.num_features!r}, "
        f"output_column={self.output_column!r})"
    )
19 changes: 13 additions & 6 deletions python/ray/data/tests/preprocessors/test_hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,30 @@ def test_feature_hasher():
{"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
)

hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
hasher = FeatureHasher(
["I", "like", "dislike", "Python"],
num_features=256,
output_column="hashed_features",
)
document_term_matrix = hasher.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()

hashed_features = document_term_matrix["hashed_features"]
# Document-term matrix should have shape (# documents, # features)
assert document_term_matrix.shape == (2, 256)
assert hashed_features.shape == (2,)

# The tokens "I", "like", and "Python" should be hashed to distinct indices
# for adequately large `num_features`.
assert document_term_matrix.iloc[0].sum() == 3
assert all(document_term_matrix.iloc[0] <= 1)
assert len(hashed_features.iloc[0]) == 256
assert hashed_features.iloc[0].sum() == 3
assert all(hashed_features.iloc[0] <= 1)

# The tokens "I", "dislike", and "Python" should be hashed to distinct
# indices for adequately large `num_features`.
assert document_term_matrix.iloc[1].sum() == 3
assert all(document_term_matrix.iloc[1] <= 1)
assert len(hashed_features.iloc[1]) == 256
assert hashed_features.iloc[1].sum() == 3
assert all(hashed_features.iloc[1] <= 1)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def preferred_batch_format(cls) -> BatchFormat:
Categorizer(columns=["X"]),
CountVectorizer(columns=["X"]),
Chain(StandardScaler(columns=["X"]), MinMaxScaler(columns=["X"])),
FeatureHasher(columns=["X"], num_features=1),
FeatureHasher(columns=["X"], num_features=1, output_column="X_transformed"),
HashingVectorizer(columns=["X"], num_features=1),
LabelEncoder(label_column="X"),
MaxAbsScaler(columns=["X"]),
Expand Down