add output_column
Signed-off-by: Martin Bomio <martinbomio@spotify.com>
martinbomio committed Feb 21, 2025
1 parent 3630de5 commit 97c6a27
Showing 3 changed files with 75 additions and 94 deletions.
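
At a glance, the commit replaces FeatureHasher's per-feature output_columns list with a single output_column whose cells each hold the full hashed vector. A minimal sketch of the new usage (not part of the commit; the dataset mirrors the token counts from the docstring example below):

    import ray
    from ray.data.preprocessors import FeatureHasher

    ds = ray.data.from_items([
        {"I": 1, "like": 1, "dislike": 0, "Python": 1},
        {"I": 1, "like": 0, "dislike": 1, "Python": 1},
    ])

    # Before this commit: one "hash_{i}" column per feature, named via
    # output_columns. After: one column whose cells hold the whole vector.
    hasher = FeatureHasher(
        columns=["I", "like", "dislike", "Python"],
        num_features=8,
        output_column="hashed",
    )
    print(hasher.fit_transform(ds).to_pandas()["hashed"].iloc[0])  # length-8 vector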
34 changes: 15 additions & 19 deletions python/ray/data/preprocessors/hasher.py
@@ -1,5 +1,5 @@
 import collections
-from typing import List, Optional
+from typing import List
 
 import pandas as pd
 
@@ -48,8 +48,8 @@ class FeatureHasher(Preprocessor):
     :class:`FeatureHasher` hashes each token to determine its index. For example,
     the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.
-    >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
-    >>> hasher.fit_transform(ds).to_pandas().to_numpy()  # doctest: +SKIP
+    >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column="hashed")
+    >>> hasher.fit_transform(ds)["hashed"].to_pandas().to_numpy()  # doctest: +SKIP
     array([[0, 0, 0, 2, 0, 1, 0, 0],
            [0, 0, 0, 1, 0, 1, 1, 0]])
@@ -63,6 +63,7 @@ class FeatureHasher(Preprocessor):
         num_features: The number of features used to represent the vocabulary. You
             should choose a value large enough to prevent hash collisions between
             distinct tokens.
+        output_column: The name of the column that contains the hashed features.
     .. seealso::
         :class:`~ray.data.preprocessors.CountVectorizer`
@@ -79,18 +80,13 @@ def __init__(
         self,
         columns: List[str],
         num_features: int,
-        output_columns: Optional[List[str]] = None,
+        output_column: str,
     ):
         self.columns = columns
         # TODO(matt): Set default number of features.
         # This likely requires sparse matrix support to avoid explosion of columns.
         self.num_features = num_features
-        if output_columns and len(output_columns) != num_features:
-            raise ValueError(
-                "Invalid output_columns: Got num_features != len(output_columns)."
-                "The number of features and output_columns must match."
-            )
-        self.output_columns = output_columns
+        self.output_column = output_column
 
     def _transform_pandas(self, df: pd.DataFrame):
         # TODO(matt): Use sparse matrix for efficiency.
@@ -99,24 +95,24 @@ def row_feature_hasher(row):
             for column in self.columns:
                 hashed_value = simple_hash(column, self.num_features)
                 hash_counts[hashed_value] += row[column]
-            return {
-                f"hash_{output_column}": hash_counts[i]
-                for output_column in self.output_columns
-            }
+            return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)}
 
         feature_columns = df.loc[:, self.columns].apply(
             row_feature_hasher, axis=1, result_type="expand"
         )
-        df = df.join(feature_columns)
-
-        # Drop original unhashed columns if output_columns is None.
-        if not self.output_columns:
-            df.drop(columns=self.columns, inplace=True)
+        # Concatenate the hash columns
+        hash_columns = [f"hash_{i}" for i in range(self.num_features)]
+        concatenated = feature_columns[hash_columns].to_numpy()
+        # Use a Pandas Series for column assignment to get more consistent
+        # behavior across Pandas versions.
+        df.loc[:, self.output_column] = pd.Series(list(concatenated))
 
         return df
 
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(columns={self.columns!r}, "
             f"num_features={self.num_features!r}, "
-            f"output_columns={self.output_columns!r})"
+            f"output_column={self.output_column!r})"
         )
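
The new comment about "more consistent behavior across Pandas versions" concerns assigning a 2-D array to a single column: pandas expects one value per row, so a plain NumPy assignment raises, while a Series of row vectors stores one feature vector per cell. A small standalone illustration of the pattern (not Ray code; the names here are made up for the example):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"doc": ["d0", "d1"]})
    hashed = np.array([[0, 2, 1], [1, 0, 3]])  # shape (n_rows, num_features)

    # df["hashed"] = hashed would raise: pandas sees a (2, 3) block where it
    # expects 2 cells. Wrapping the rows in a Series yields one vector per cell.
    df.loc[:, "hashed"] = pd.Series(list(hashed), index=df.index)
    print(df["hashed"].iloc[0])  # [0 2 1]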
88 changes: 14 additions & 74 deletions python/ray/data/tests/preprocessors/test_hasher.py
@@ -2,7 +2,7 @@
 import pytest
 
 import ray
-from ray.data.preprocessors import FeatureHasher, HashingVectorizer
+from ray.data.preprocessors import FeatureHasher
 
 
 def test_feature_hasher():
@@ -13,90 +13,30 @@ def test_feature_hasher():
{"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
)

hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
hasher = FeatureHasher(
["I", "like", "dislike", "Python"],
num_features=256,
output_column="hashed_features",
)
document_term_matrix = hasher.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()

hashed_features = document_term_matrix["hashed_features"]
# Document-term matrix should have shape (# documents, # features)
assert document_term_matrix.shape == (2, 256)
assert hashed_features.shape == (2,)

# The tokens tokens "I", "like", and "Python" should be hashed to distinct indices
# for adequately large `num_features`.
assert document_term_matrix.iloc[0].sum() == 3
assert all(document_term_matrix.iloc[0] <= 1)
assert len(hashed_features.iloc[0]) == 256
assert hashed_features.iloc[0].sum() == 3
assert all(hashed_features.iloc[0] <= 1)

# The tokens tokens "I", "dislike", and "Python" should be hashed to distinct
# indices for adequately large `num_features`.
assert document_term_matrix.iloc[1].sum() == 3
assert all(document_term_matrix.iloc[1] <= 1)

# Test append mode
with pytest.raises(ValueError):
FeatureHasher(
columns=["I", "like", "dislike", "Python"],
num_features=256,
output_columns=["B_encoded"],
)

hasher_append = FeatureHasher(
["I", "like", "dislike", "Python"],
num_features=256,
output_columns=[f"hash_{i}" for i in range(256)],
)
document_term_matrix_append = hasher_append.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()

assert document_term_matrix_append.shape == (
2,
256 + 4,
) # original columns + hashed columns


def test_hashing_vectorizer():
"""Tests basic HashingVectorizer functionality."""

col_a = ["a b b c c c", "a a a a c"]
col_b = ["apple", "banana banana banana"]
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
ds = ray.data.from_pandas(in_df)

vectorizer = HashingVectorizer(
["A", "B"],
num_features=3,
output_columns=[
"hash_A_0",
"hash_A_1",
"hash_A_2",
"hash_B_0",
"hash_B_1",
"hash_B_2",
],
)

transformed = vectorizer.transform(ds)
out_df = transformed.to_pandas()

processed_col_a_0 = [2, 0]
processed_col_a_1 = [1, 4]
processed_col_a_2 = [3, 1]
processed_col_b_0 = [1, 0]
processed_col_b_1 = [0, 3]
processed_col_b_2 = [0, 0]

expected_df = pd.DataFrame.from_dict(
{
"hash_A_0": processed_col_a_0,
"hash_A_1": processed_col_a_1,
"hash_A_2": processed_col_a_2,
"hash_B_0": processed_col_b_0,
"hash_B_1": processed_col_b_1,
"hash_B_2": processed_col_b_2,
}
)

assert out_df.equals(expected_df)
assert len(hashed_features.iloc[1]) == 256
assert hashed_features.iloc[1].sum() == 3
assert all(hashed_features.iloc[1] <= 1)


if __name__ == "__main__":
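
For context on what the updated test asserts: each input column name is hashed into one of num_features buckets and the row's counts are summed per bucket, so with 256 buckets the three present tokens almost surely land in distinct indices. A rough sketch of that logic; simple_hash here is a hypothetical md5-based stand-in, not Ray's actual implementation:

    import hashlib
    from collections import Counter

    def simple_hash(value, num_buckets):
        # Hypothetical stand-in for Ray's hash: deterministic digest mod buckets.
        digest = hashlib.md5(str(value).encode()).hexdigest()
        return int(digest, 16) % num_buckets

    row = {"I": 1, "like": 1, "dislike": 0, "Python": 1}
    counts = Counter()
    for token, count in row.items():
        counts[simple_hash(token, 256)] += count

    # Three tokens are present, so the bucket counts sum to 3; with 256
    # buckets, collisions are unlikely and no single bucket exceeds 1.
    assert sum(counts.values()) == 3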
47 changes: 46 additions & 1 deletion python/ray/data/tests/preprocessors/test_vectorizer.py
@@ -4,7 +4,7 @@
 import pytest
 
 import ray
-from ray.data.preprocessors import CountVectorizer
+from ray.data.preprocessors import CountVectorizer, HashingVectorizer
 
 
 def test_count_vectorizer():
@@ -82,6 +82,51 @@ def test_count_vectorizer():
     assert out_df.equals(expected_df)
 
 
+def test_hashing_vectorizer():
+    """Tests basic HashingVectorizer functionality."""
+
+    col_a = ["a b b c c c", "a a a a c"]
+    col_b = ["apple", "banana banana banana"]
+    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
+    ds = ray.data.from_pandas(in_df)
+
+    vectorizer = HashingVectorizer(
+        ["A", "B"],
+        num_features=3,
+        output_columns=[
+            "hash_A_0",
+            "hash_A_1",
+            "hash_A_2",
+            "hash_B_0",
+            "hash_B_1",
+            "hash_B_2",
+        ],
+    )
+
+    transformed = vectorizer.transform(ds)
+    out_df = transformed.to_pandas()
+
+    processed_col_a_0 = [2, 0]
+    processed_col_a_1 = [1, 4]
+    processed_col_a_2 = [3, 1]
+    processed_col_b_0 = [1, 0]
+    processed_col_b_1 = [0, 3]
+    processed_col_b_2 = [0, 0]
+
+    expected_df = pd.DataFrame.from_dict(
+        {
+            "hash_A_0": processed_col_a_0,
+            "hash_A_1": processed_col_a_1,
+            "hash_A_2": processed_col_a_2,
+            "hash_B_0": processed_col_b_0,
+            "hash_B_1": processed_col_b_1,
+            "hash_B_2": processed_col_b_2,
+        }
+    )
+
+    assert out_df.equals(expected_df)
 
 
 if __name__ == "__main__":
     import sys
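
Note that, unlike FeatureHasher above, HashingVectorizer keeps the per-column output_columns list in this test: it tokenizes each text column and emits one hash_{column}_{i} count column per bucket. A generic sketch of that per-column bucketing, using Python's built-in hash as a stand-in (Ray's actual bucket assignments will differ):

    def hashing_vectorize(text, num_features, hasher=hash):
        # Whitespace-tokenize, then count tokens per hash bucket.
        counts = [0] * num_features
        for token in text.split():
            counts[hasher(token) % num_features] += 1
        return counts

    # "a b b c c c" has 6 tokens, so its 3 bucket counts always sum to 6,
    # matching the test's first row for column "A" ([2, 1, 3] there).
    assert sum(hashing_vectorize("a b b c c c", 3)) == 6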
