diff --git a/py-polars/polars/utils/udfs.py b/py-polars/polars/utils/udfs.py index 699761321b05..da10edc35b53 100644 --- a/py-polars/polars/utils/udfs.py +++ b/py-polars/polars/utils/udfs.py @@ -86,6 +86,7 @@ class StackValue(NamedTuple): "lower": "str.to_lowercase", "title": "str.to_titlecase", "upper": "str.to_uppercase", + "loads": "str.json_extract", } @@ -370,13 +371,14 @@ class RewrittenInstructions: from the identification of expression translation opportunities. """ + _ignored_ops = frozenset(["COPY_FREE_VARS", "PRECALL", "RESUME", "RETURN_VALUE"]) + def __init__(self, instructions: Iterator[Instruction]): self._instructions = instructions self._rewritten_instructions = self._apply_rules( self._upgrade_instruction(inst) for inst in instructions - if inst.opname - not in ("COPY_FREE_VARS", "PRECALL", "RESUME", "RETURN_VALUE") + if inst.opname not in self._ignored_ops ) def __len__(self) -> int: @@ -403,28 +405,34 @@ def _apply_rules(self, instructions: Iterator[Instruction]) -> list[Instruction] apply_rewrite(inst, updated_instructions) for apply_rewrite in ( # add any other rewrite methods here - self._numpy_functions, - self._python_functions, + self._rewrite_functions, + self._rewrite_methods, ) ): updated_instructions.append(inst) return updated_instructions - def _numpy_functions( + def _rewrite_functions( self, inst: Instruction, instructions: list[Instruction] ) -> bool: - """Replace numpy function calls with a synthetic POLARS_EXPRESSION op.""" - if inst.opname == "LOAD_GLOBAL" and inst.argval in _NUMPY_MODULE_ALIASES: + """Replace numpy/json function calls with a synthetic POLARS_EXPRESSION op.""" + if inst.opname == "LOAD_GLOBAL" and ( + inst.argval in _NUMPY_MODULE_ALIASES or inst.argval == "json" + ): instruction_buffer = list(islice(self._instructions, 3)) if ( len(instruction_buffer) == 3 - and instruction_buffer[0].argval in _NUMPY_FUNCTIONS + and ( + instruction_buffer[0].argval == "loads" + or instruction_buffer[0].argval in _NUMPY_FUNCTIONS + ) and instruction_buffer[1].opname.startswith("LOAD_") and instruction_buffer[2].opname.startswith("CALL") ): # note: synthetic POLARS_EXPRESSION is mapped as a unary # op, so we switch the instruction order on injection expr_name = instruction_buffer[0].argval + expr_name = _PYFUNCTION_MAP.get(expr_name, expr_name) offsets = inst.offset, instruction_buffer[1].offset synthetic_call = inst._replace( opname="POLARS_EXPRESSION", @@ -440,7 +448,7 @@ def _numpy_functions( return True return False - def _python_functions( + def _rewrite_methods( self, inst: Instruction, instructions: list[Instruction] ) -> bool: """Replace python method calls with synthetic POLARS_EXPRESSION op.""" @@ -472,15 +480,23 @@ def _upgrade_instruction(inst: Instruction) -> Instruction: return inst -def _is_raw_numpy_function(function: Callable[[Any], Any]) -> bool: - """Identify numpy calls that are not wrapped in a lambda/function.""" +def _is_raw_function(function: Callable[[Any], Any]) -> tuple[str, str]: + """Identify translatable calls that aren't wrapped inside a lambda/function.""" try: - return ( - function.__class__.__module__ == "numpy" - and function.__name__ in _NUMPY_FUNCTIONS - ) + func_module = function.__class__.__module__ + func_name = function.__name__ + if func_module == "numpy" and func_name in _NUMPY_FUNCTIONS: + return "np", func_name + elif func_module == "builtins" and func_name == "loads": + import json # double-check since it is referenced via 'builtins' + + if function is json.loads: + return "json", "str.json_extract" + except AttributeError: - return False + pass + + return "", "" def warn_on_inefficient_apply( @@ -512,10 +528,16 @@ def warn_on_inefficient_apply( parser = BytecodeParser(function, apply_target) if parser.can_rewrite(): parser.warn(col) - elif _is_raw_numpy_function(function): - fn = function.__name__ - suggestion = f'pl.col("{col}").{fn}()' - parser.warn(col, suggestion_override=suggestion, func_name_override=f"np.{fn}") + else: + # handle bare numpy/json functions + module, func_name = _is_raw_function(function) + if module and func_name: + fn = function.__name__ + parser.warn( + col, + suggestion_override=f'pl.col("{col}").{func_name}()', + func_name_override=f"{module}.{fn}", + ) __all__ = [ diff --git a/py-polars/tests/unit/operations/test_apply.py b/py-polars/tests/unit/operations/test_apply.py index 08ba6186309f..840a2754830e 100644 --- a/py-polars/tests/unit/operations/test_apply.py +++ b/py-polars/tests/unit/operations/test_apply.py @@ -172,18 +172,28 @@ def test_datelike_identity() -> None: def test_apply_list_anyvalue_fallback() -> None: import json - df = pl.DataFrame({"text": ['[{"x": 1, "y": 2}, {"x": 3, "y": 4}]']}) - assert df.select(pl.col("text").apply(json.loads)).to_dict(False) == { - "text": [[{"x": 1, "y": 2}, {"x": 3, "y": 4}]] - } + with pytest.warns( + PolarsInefficientApplyWarning, + match=r'(?s)replace your `apply` with.*pl.col\("text"\).str.json_extract()', + ): + df = pl.DataFrame({"text": ['[{"x": 1, "y": 2}, {"x": 3, "y": 4}]']}) + assert df.select(pl.col("text").apply(json.loads)).to_dict(False) == { + "text": [[{"x": 1, "y": 2}, {"x": 3, "y": 4}]] + } - # starts with empty list '[]' - df = pl.DataFrame( - {"text": ["[]", '[{"x": 1, "y": 2}, {"x": 3, "y": 4}]', '[{"x": 1, "y": 2}]']} - ) - assert df.select(pl.col("text").apply(json.loads)).to_dict(False) == { - "text": [[], [{"x": 1, "y": 2}, {"x": 3, "y": 4}], [{"x": 1, "y": 2}]] - } + # starts with empty list '[]' + df = pl.DataFrame( + { + "text": [ + "[]", + '[{"x": 1, "y": 2}, {"x": 3, "y": 4}]', + '[{"x": 1, "y": 2}]', + ] + } + ) + assert df.select(pl.col("text").apply(json.loads)).to_dict(False) == { + "text": [[], [{"x": 1, "y": 2}, {"x": 3, "y": 4}], [{"x": 1, "y": 2}]] + } def test_apply_all_types() -> None: @@ -246,7 +256,8 @@ def test_apply_skip_nulls() -> None: def test_apply_object_dtypes() -> None: with pytest.warns( - PolarsInefficientApplyWarning, match="In this case, you can replace" + PolarsInefficientApplyWarning, + match=r"(?s)replace your `apply` with.*lambda x:", ): assert pl.DataFrame( {"a": pl.Series([1, 2, "a", 4, 5], dtype=pl.Object)} @@ -283,15 +294,19 @@ def test_apply_explicit_list_output_type() -> None: def test_apply_dict() -> None: - df = pl.DataFrame({"Col": ['{"A":"Value1"}', '{"B":"Value2"}']}) - assert df.select(pl.col("Col").apply(json.loads)).to_dict(False) == { - "Col": [{"A": "Value1", "B": None}, {"A": None, "B": "Value2"}] - } - assert pl.DataFrame( - {"Col": ['{"A":"Value1", "B":"Value2"}', '{"B":"Value3"}']} - ).select(pl.col("Col").apply(json.loads)).to_dict(False) == { - "Col": [{"A": "Value1", "B": "Value2"}, {"A": None, "B": "Value3"}] - } + with pytest.warns( + PolarsInefficientApplyWarning, + match=r'(?s)replace your `apply` with.*pl.col\("abc"\).str.json_extract()', + ): + df = pl.DataFrame({"abc": ['{"A":"Value1"}', '{"B":"Value2"}']}) + assert df.select(pl.col("abc").apply(json.loads)).to_dict(False) == { + "abc": [{"A": "Value1", "B": None}, {"A": None, "B": "Value2"}] + } + assert pl.DataFrame( + {"abc": ['{"A":"Value1", "B":"Value2"}', '{"B":"Value3"}']} + ).select(pl.col("abc").apply(json.loads)).to_dict(False) == { + "abc": [{"A": "Value1", "B": "Value2"}, {"A": None, "B": "Value3"}] + } def test_apply_pass_name() -> None: diff --git a/py-polars/tests/unit/operations/test_inefficient_apply.py b/py-polars/tests/unit/operations/test_inefficient_apply.py index 37a432ecac8a..8aa07242c9ab 100644 --- a/py-polars/tests/unit/operations/test_inefficient_apply.py +++ b/py-polars/tests/unit/operations/test_inefficient_apply.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json from typing import Any, Callable import numpy @@ -31,7 +32,7 @@ def test_parse_invalid_function(func: Callable[[Any], Any]) -> None: @pytest.mark.parametrize( ("col", "func"), [ - # numeric cols: math, comparison, logic ops + # numeric col: math, comparison, logic ops ("a", lambda x: x + 1 - (2 / 3)), ("a", lambda x: x // 1 % 2), ("a", lambda x: x & True), @@ -51,9 +52,11 @@ def test_parse_invalid_function(func: Callable[[Any], Any]) -> None: ("a", lambda x: MY_CONSTANT + x), ("a", lambda x: 0 + numpy.cbrt(x)), ("a", lambda x: np.sin(x) + 1), - # string cols + # string col: case ops ("b", lambda x: x.title()), ("b", lambda x: x.lower() + ":" + x.upper()), + # json col: load/extract + ("c", lambda x: json.loads(x)), ], ) def test_parse_apply_functions(col: str, func: Callable[[Any], Any]) -> None: @@ -68,6 +71,7 @@ def test_parse_apply_functions(col: str, func: Callable[[Any], Any]) -> None: { "a": [1, 2, 3], "b": ["AB", "cd", "eF"], + "c": ['{"a": 1}', '{"b": 2}', '{"c": 3}'], } ) result = df.select( @@ -81,9 +85,10 @@ def test_parse_apply_functions(col: str, func: Callable[[Any], Any]) -> None: assert_frame_equal(result, expected) -def test_parse_apply_numpy_raw() -> None: +def test_parse_apply_raw_functions() -> None: lf = pl.LazyFrame({"a": [1, 2, 3]}) + # test bare numpy functions for func_name in _NUMPY_FUNCTIONS: func = getattr(numpy, func_name) @@ -93,12 +98,31 @@ def test_parse_apply_numpy_raw() -> None: # ...but we ARE still able to warn with pytest.warns( - PolarsInefficientApplyWarning, match="In this case, you can replace" + PolarsInefficientApplyWarning, + match=rf"(?s)In this case, you can replace.*np\.{func_name}", ): df1 = lf.select(pl.col("a").apply(func)).collect() df2 = lf.select(getattr(pl.col("a"), func_name)()).collect() assert_frame_equal(df1, df2) + # test bare json.loads + result_frames = [] + with pytest.warns( + PolarsInefficientApplyWarning, + match=r"(?s)In this case, you can replace.*\.str\.json_extract", + ): + for expr in ( + pl.col("value").str.json_extract(), + pl.col("value").apply(json.loads), + ): + result_frames.append( + pl.LazyFrame({"value": ['{"a":1, "b": true, "c": "xx"}', None]}) + .select(extracted=expr) + .unnest("extracted") + .collect() + ) + assert_frame_equal(*result_frames) + def test_parse_apply_miscellaneous() -> None: # note: can also identify inefficient functions and methods as well as lambdas