Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Add simple version of json_normalize #17015

Merged
merged 4 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ JSON
.. autosummary::
:toctree: api/

json_normalize
read_json
read_ndjson
scan_ndjson
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@
zeros,
)
from polars.io import (
json_normalize,
read_avro,
read_clipboard,
read_csv,
Expand Down Expand Up @@ -302,6 +303,7 @@
# polars.type_aliases
"PolarsDataType",
# polars.io
"json_normalize",
"read_avro",
"read_clipboard",
"read_csv",
Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@
from polars.io.delta import read_delta, scan_delta
from polars.io.iceberg import scan_iceberg
from polars.io.ipc import read_ipc, read_ipc_schema, read_ipc_stream, scan_ipc
from polars.io.json import read_json
from polars.io.json import json_normalize, read_json
from polars.io.ndjson import read_ndjson, scan_ndjson
from polars.io.parquet import read_parquet, read_parquet_schema, scan_parquet
from polars.io.pyarrow_dataset import scan_pyarrow_dataset
from polars.io.spreadsheet import read_excel, read_ods

__all__ = [
"json_normalize",
"read_avro",
"read_clipboard",
"read_csv",
Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/io/json/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from polars.io.json.normalize import json_normalize
from polars.io.json.read import read_json

# Sorted alphabetically for consistency with the other `__all__` lists
# in the package (e.g. polars/io/__init__.py).
__all__ = ["json_normalize", "read_json"]
212 changes: 212 additions & 0 deletions py-polars/polars/io/json/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
# This code is partially forked and adapted from pandas.
# Some parts are distributed under: https://github.com/pandas-dev/pandas/blob/main/LICENSE
from __future__ import annotations

import json
from collections import abc
from typing import TYPE_CHECKING, Any, Sequence

from polars.dataframe import DataFrame
from polars.datatypes.constants import N_INFER_DEFAULT

if TYPE_CHECKING:
from polars.schema import Schema


def _simple_json_normalize(
data: dict[Any, Any] | Sequence[dict[Any, Any] | Any],
separator: str,
max_level: int,
) -> dict[Any, Any] | list[dict[Any, Any]] | Any:
if max_level > 0:
normalized_json_object = {}
# expect a dictionary, as most jsons are. However, lists are perfectly valid
if isinstance(data, dict):
normalized_json_object = _normalize_json_ordered(
data=data, separator=separator, max_level=max_level
)
elif isinstance(data, list):
normalised_json_list = [
_simple_json_normalize(row, separator=separator, max_level=max_level)
for row in data
]
return normalised_json_list
return normalized_json_object
else:
return data


def _normalize_json_ordered(
    data: dict[str, Any], separator: str, max_level: int
) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth.

    Parameters
    ----------
    data
        dict or list of dicts
    separator
        str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    max_level
        max recursing level

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    # Partition the top-level entries: scalars stay as-is, dict values are
    # recursively flattened.
    scalar_entries: dict[str, Any] = {}
    nested_entries: dict[str, Any] = {}
    for key, value in data.items():
        if isinstance(value, dict):
            nested_entries[key] = value
        else:
            scalar_entries[key] = value
    flattened = normalize_json(
        data=nested_entries,
        key_string="",
        normalized_dict={},
        separator=separator,
        max_level=max_level,
    )
    # Scalar keys first, then the flattened nested keys.
    return {**scalar_entries, **flattened}


def json_normalize(
    data: dict[Any, Any] | Sequence[dict[Any, Any] | Any],
    *,
    separator: str = ".",
    max_level: int | None = None,
    schema: Schema | None = None,
    strict: bool = True,
    infer_schema_length: int | None = N_INFER_DEFAULT,
) -> DataFrame:
    """
    Normalize semi-structured deserialized JSON data into a flat table.

    Dictionary objects that will not be unnested/normalized are encoded
    as json string data. Unlike its pandas counterpart, this function will
    not encode dictionaries as objects at any level.

    Parameters
    ----------
    data
        Deserialized JSON objects.
    separator
        Nested records will generate names separated by sep. e.g.,
        for `separator=".", {"foo": {"bar": 0}}` -> foo.bar.
    max_level
        Max number of levels(depth of dict) to normalize.
        If None, normalizes all levels.
    schema
        Overwrite the `Schema` when the normalized data is passed to
        the `DataFrame` constructor.
    strict
        Whether Polars should be strict when constructing the DataFrame.
    infer_schema_length
        Number of rows to take into consideration to determine the schema.

    Examples
    --------
    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pl.json_normalize(data, max_level=1)
    shape: (3, 4)
    ┌──────┬────────────┬────────────────┬────────────────┐
    │ id   ┆ name       ┆ fitness.height ┆ fitness.weight │
    │ ---  ┆ ---        ┆ ---            ┆ ---            │
    │ i64  ┆ str        ┆ i64            ┆ i64            │
    ╞══════╪════════════╪════════════════╪════════════════╡
    │ 1    ┆ Cole Volk  ┆ 130            ┆ 60             │
    │ null ┆ Mark Reg   ┆ 130            ┆ 60             │
    │ 2    ┆ Faye Raker ┆ 130            ┆ 60             │
    └──────┴────────────┴────────────────┴────────────────┘
    >>> pl.json_normalize(data, max_level=0)
    shape: (3, 3)
    ┌──────┬────────────┬───────────────────────────────┐
    │ id   ┆ name       ┆ fitness                       │
    │ ---  ┆ ---        ┆ ---                           │
    │ i64  ┆ str        ┆ str                           │
    ╞══════╪════════════╪═══════════════════════════════╡
    │ 1    ┆ Cole Volk  ┆ {"height": 130, "weight": 60} │
    │ null ┆ Mark Reg   ┆ {"height": 130, "weight": 60} │
    │ 2    ┆ Faye Raker ┆ {"height": 130, "weight": 60} │
    └──────┴────────────┴───────────────────────────────┘
    """
    if max_level is None:
        # Effectively unbounded depth for any practical input.
        max_level = 1 << 32
    # Offset by one: each row dict itself consumes one recursion level, so
    # `max_level=0` still splits the top-level keys into columns while
    # JSON-encoding any nested dicts below them.
    max_level += 1
    if isinstance(data, list) and not data:
        # Empty input -> empty frame.
        return DataFrame()
    elif isinstance(data, dict):
        # A single mapping normalizes to a one-row frame.
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):  # type: ignore[redundant-expr]
        data = list(data)
    else:
        msg = "expected list of objects"
        raise ValueError(msg)
    return DataFrame(
        _simple_json_normalize(data, separator=separator, max_level=max_level),
        schema=schema,
        strict=strict,
        infer_schema_length=infer_schema_length,
    )


def normalize_json(
    data: Any,
    key_string: str,
    normalized_dict: dict[str, Any],
    separator: str,
    max_level: int,
) -> dict[str, Any]:
    """
    Main recursive function.

    Designed for the most basic use case of pl.json_normalize(data)
    intended as a performance improvement.

    Parameters
    ----------
    data : Any
        Type dependent on types contained within nested Json
    key_string : str
        New key (with separator(s) in) for data
    normalized_dict : dict
        The new normalized/flattened Json dict
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    max_level
        recursion depth
    """
    # Leaf value: store it under the accumulated key and stop.
    if not isinstance(data, dict):
        normalized_dict[key_string] = data
        return normalized_dict

    # Depth budget spent: encode the remaining subtree as a JSON string.
    if max_level <= 0:
        normalized_dict[key_string] = json.dumps(data)
        return normalized_dict

    for child_key, child_value in data.items():
        # Prefix with the parent path; at the top level (empty key_string)
        # the child key stands alone.
        if key_string:
            prefixed_key = f"{key_string}{separator}{child_key}"
        else:
            prefixed_key = f"{child_key}"
        normalize_json(
            data=child_value,
            key_string=prefixed_key,
            normalized_dict=normalized_dict,
            separator=separator,
            max_level=max_level - 1,
        )
    return normalized_dict
File renamed without changes.
55 changes: 55 additions & 0 deletions py-polars/tests/unit/io/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,58 @@ def test_json_wrong_input_handle_textio(tmp_path: Path) -> None:
df.write_ndjson(file_path)
with open(file_path) as f: # noqa: PTH123
assert_frame_equal(pl.read_ndjson(f), df)


def test_json_normalize() -> None:
    # Rows with heterogeneous nested "name" payloads.
    data = [
        {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
        {"name": {"given": "Mark", "family": "Regner"}},
        {"id": 2, "name": "Faye Raker"},
    ]

    # max_level=0: nested dicts are kept, JSON-encoded as strings.
    expected_encoded = {
        "id": [1, None, 2],
        "name": [
            '{"first": "Coleen", "last": "Volk"}',
            '{"given": "Mark", "family": "Regner"}',
            "Faye Raker",
        ],
    }
    result = pl.json_normalize(data, max_level=0).to_dict(as_series=False)
    assert result == expected_encoded

    # max_level=1: one level of nesting becomes separator-joined columns.
    expected_flat = {
        "id": [1, None, 2],
        "name.first": ["Coleen", None, None],
        "name.last": ["Volk", None, None],
        "name.given": [None, "Mark", None],
        "name.family": [None, "Regner", None],
        "name": [None, None, "Faye Raker"],
    }
    result = pl.json_normalize(data, max_level=1).to_dict(as_series=False)
    assert result == expected_flat

    # Rows where every record nests the same "fitness" sub-dict.
    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]
    expected_encoded = {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness": [
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
        ],
    }
    result = pl.json_normalize(data, max_level=0).to_dict(as_series=False)
    assert result == expected_encoded

    expected_flat = {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness.height": [130, 130, 130],
        "fitness.weight": [60, 60, 60],
    }
    result = pl.json_normalize(data, max_level=1).to_dict(as_series=False)
    assert result == expected_flat
Loading