From b7d11d2a89975ea37eb1f60d4bdf5b3c72710318 Mon Sep 17 00:00:00 2001
From: Martin
Date: Mon, 24 Feb 2025 18:25:19 -0500
Subject: [PATCH] [data/preprocessors] feat: allow tokenizer to execute in append mode (#50848)

## Why are these changes needed?

This is part of https://github.com/ray-project/ray/issues/48133. Continuing the approach taken in https://github.com/ray-project/ray/pull/49426, this PR makes `Tokenizer` work in append mode: when `output_columns` is provided, the tokenized values are written to new columns instead of overwriting the input columns.
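
For context, here is a minimal sketch of the new usage. It mirrors the doctest added to the `Tokenizer` docstring in the diff below; the dataset and column names are only illustrative, and it assumes the `output_columns` parameter introduced in this PR.

```python
import ray
from ray.data.preprocessors import Tokenizer

# Illustrative dataset with a single "text" column.
ds = ray.data.from_items([{"text": "Hello, world!"}, {"text": "foo bar\nbaz"}])

# Append mode: the input "text" column is preserved and the tokens produced by
# the default tokenizer (equivalent to ``s.split(" ")``) are written to the new
# "text_tokenized" column.
tokenizer = Tokenizer(columns=["text"], output_columns=["text_tokenized"])
print(tokenizer.transform(ds).to_pandas())
#             text    text_tokenized
# 0  Hello, world!  [Hello,, world!]
# 1   foo bar\nbaz   [foo, bar\nbaz]
```

Omitting `output_columns` keeps the existing in-place behavior, where the tokenized values replace the input columns.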

## Related issue number

https://github.com/ray-project/ray/pull/49426

## Checks

- [x] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [x] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [x] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [x] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
  - [x] Unit tests
  - [ ] Release tests
  - [ ] This PR is not tested :(

Signed-off-by: Martin Bomio
---
 python/ray/data/preprocessors/tokenizer.py    | 21 +++++++-
 .../tests/preprocessors/test_tokenizer.py     | 53 ++++++++++++++++++-
 2 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/python/ray/data/preprocessors/tokenizer.py b/python/ray/data/preprocessors/tokenizer.py
index 03bc14185244f..8e44d3934579d 100644
--- a/python/ray/data/preprocessors/tokenizer.py
+++ b/python/ray/data/preprocessors/tokenizer.py
@@ -40,12 +40,25 @@ class Tokenizer(Preprocessor):
         0   [Hello, world]
         1  [foo, bar, baz]
 
+        :class:`Tokenizer` can also be used in append mode by providing the
+        name of the output_columns that should hold the tokenized values.
+
+        >>> tokenizer = Tokenizer(columns=["text"], output_columns=["text_tokenized"])
+        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
+                    text    text_tokenized
+        0  Hello, world!  [Hello,, world!]
+        1   foo bar\\nbaz   [foo, bar\\nbaz]
+
     Args:
         columns: The columns to tokenize.
         tokenization_fn: The function used to generate tokens. This function
             should accept a string as input and return a list of tokens as
             output. If unspecified, the tokenizer uses a function equivalent to
             ``lambda s: s.split(" ")``.
+        output_columns: The names of the transformed columns. If None, the transformed
+            columns will be the same as the input columns. If not None, the length of
+            ``output_columns`` must match the length of ``columns``, otherwise an error
+            will be raised.
     """
 
     _is_fittable = False
@@ -54,21 +67,25 @@ def __init__(
         self,
         columns: List[str],
         tokenization_fn: Optional[Callable[[str], List[str]]] = None,
+        output_columns: Optional[List[str]] = None,
     ):
         self.columns = columns
         # TODO(matt): Add a more robust default tokenizer.
         self.tokenization_fn = tokenization_fn or simple_split_tokenizer
+        self.output_columns = Preprocessor._derive_and_validate_output_columns(
+            columns, output_columns
+        )
 
     def _transform_pandas(self, df: pd.DataFrame):
         def column_tokenizer(s: pd.Series):
             return s.map(self.tokenization_fn)
 
-        df.loc[:, self.columns] = df.loc[:, self.columns].transform(column_tokenizer)
+        df[self.output_columns] = df.loc[:, self.columns].transform(column_tokenizer)
         return df
 
     def __repr__(self):
         name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn)
         return (
             f"{self.__class__.__name__}(columns={self.columns!r}, "
-            f"tokenization_fn={name})"
+            f"tokenization_fn={name}, output_columns={self.output_columns!r})"
         )
diff --git a/python/ray/data/tests/preprocessors/test_tokenizer.py b/python/ray/data/tests/preprocessors/test_tokenizer.py
index 8a6af360588b5..8adc3cef5d965 100644
--- a/python/ray/data/tests/preprocessors/test_tokenizer.py
+++ b/python/ray/data/tests/preprocessors/test_tokenizer.py
@@ -24,7 +24,58 @@ def test_tokenizer():
     ]
     expected_df = pd.DataFrame.from_dict({"A": processed_col_a, "B": processed_col_b})
 
-    assert out_df.equals(expected_df)
+    pd.testing.assert_frame_equal(out_df, expected_df, check_like=True)
+
+    # Test append mode
+    with pytest.raises(
+        ValueError, match="The length of columns and output_columns must match."
+    ):
+        Tokenizer(columns=["A", "B"], output_columns=["A_tokenized"])
+
+    tokenizer = Tokenizer(
+        columns=["A", "B"], output_columns=["A_tokenized", "B_tokenized"]
+    )
+    transformed = tokenizer.transform(ds)
+    out_df = transformed.to_pandas()
+    print(out_df)
+    expected_df = pd.DataFrame.from_dict(
+        {
+            "A": col_a,
+            "B": col_b,
+            "A_tokenized": processed_col_a,
+            "B_tokenized": processed_col_b,
+        }
+    )
+
+    pd.testing.assert_frame_equal(out_df, expected_df, check_like=True)
+
+    # Test custom tokenization function
+    def custom_tokenizer(s: str) -> list:
+        return s.replace("banana", "fruit").split()
+
+    tokenizer = Tokenizer(
+        columns=["A", "B"],
+        tokenization_fn=custom_tokenizer,
+        output_columns=["A_custom", "B_custom"],
+    )
+    transformed = tokenizer.transform(ds)
+    out_df = transformed.to_pandas()
+
+    custom_processed_col_a = [["this", "is", "a", "test"], ["apple"]]
+    custom_processed_col_b = [
+        ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"],
+        ["fruit", "fruit"],
+    ]
+    expected_df = pd.DataFrame.from_dict(
+        {
+            "A": col_a,
+            "B": col_b,
+            "A_custom": custom_processed_col_a,
+            "B_custom": custom_processed_col_b,
+        }
+    )
+
+    pd.testing.assert_frame_equal(out_df, expected_df, check_like=True)
 
 
 if __name__ == "__main__":