Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Add simple version of json_normalize #17015

Merged
merged 4 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ JSON
.. autosummary::
:toctree: api/

json_normalize
read_json
read_ndjson
scan_ndjson
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@
zeros,
)
from polars.io import (
json_normalize,
read_avro,
read_clipboard,
read_csv,
Expand Down Expand Up @@ -302,6 +303,7 @@
# polars.type_aliases
"PolarsDataType",
# polars.io
"json_normalize",
"read_avro",
"read_clipboard",
"read_csv",
Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@
from polars.io.delta import read_delta, scan_delta
from polars.io.iceberg import scan_iceberg
from polars.io.ipc import read_ipc, read_ipc_schema, read_ipc_stream, scan_ipc
from polars.io.json import read_json
from polars.io.json import json_normalize, read_json
from polars.io.ndjson import read_ndjson, scan_ndjson
from polars.io.parquet import read_parquet, read_parquet_schema, scan_parquet
from polars.io.pyarrow_dataset import scan_pyarrow_dataset
from polars.io.spreadsheet import read_excel, read_ods

__all__ = [
"json_normalize",
"read_avro",
"read_clipboard",
"read_csv",
Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/io/json/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from polars.io.json.normalize import json_normalize
from polars.io.json.read import read_json

# Sorted alphabetically for consistency with the other `__all__` lists
# in the package (e.g. polars/io/__init__.py).
__all__ = ["json_normalize", "read_json"]
212 changes: 212 additions & 0 deletions py-polars/polars/io/json/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
# This code is partially forked and adapted from pandas.
# Some parts are distributed under: https://github.com/pandas-dev/pandas/blob/main/LICENSE
from __future__ import annotations

import json
from collections import abc
from typing import TYPE_CHECKING, Any, Sequence

from polars.dataframe import DataFrame
from polars.datatypes.constants import N_INFER_DEFAULT

if TYPE_CHECKING:
from polars.schema import Schema


def _simple_json_normalize(
data: dict[Any, Any] | Sequence[dict[Any, Any] | Any],
separator: str,
max_level: int,
) -> dict[Any, Any] | list[dict[Any, Any]] | Any:
if max_level > 0:
normalized_json_object = {}
# expect a dictionary, as most jsons are. However, lists are perfectly valid
if isinstance(data, dict):
normalized_json_object = _normalize_json_ordered(
data=data, separator=separator, max_level=max_level
)
elif isinstance(data, list):
normalised_json_list = [
_simple_json_normalize(row, separator=separator, max_level=max_level)
for row in data
]
return normalised_json_list
return normalized_json_object
else:
return data


def _normalize_json_ordered(
    data: dict[str, Any], separator: str, max_level: int
) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth.

    Parameters
    ----------
    data
        dict or list of dicts
    separator
        str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    max_level
        max recursing level

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    # Partition the top-level entries: scalars stay as-is, dict values are
    # recursively flattened.
    scalar_entries: dict[str, Any] = {}
    nested_entries: dict[str, Any] = {}
    for key, value in data.items():
        if isinstance(value, dict):
            nested_entries[key] = value
        else:
            scalar_entries[key] = value
    flattened = normalize_json(
        data=nested_entries,
        key_string="",
        normalized_dict={},
        separator=separator,
        max_level=max_level,
    )
    # Scalar keys first, then the flattened nested keys.
    return {**scalar_entries, **flattened}


def json_normalize(
    data: dict[Any, Any] | Sequence[dict[Any, Any] | Any],
    *,
    separator: str = ".",
    max_level: int | None = None,
    schema: Schema | None = None,
    strict: bool = True,
    infer_schema_length: int | None = N_INFER_DEFAULT,
) -> DataFrame:
    """
    Normalize semi-structured deserialized JSON data into a flat table.

    Dictionary objects that will not be unnested/normalized are encoded
    as json string data. Unlike its pandas counterpart, this function will
    not encode dictionaries as objects at any level.

    Parameters
    ----------
    data
        Deserialized JSON objects.
    separator
        Nested records will generate names separated by sep. e.g.,
        for `separator=".", {"foo": {"bar": 0}}` -> foo.bar.
    max_level
        Max number of levels(depth of dict) to normalize.
        If None, normalizes all levels.
    schema
        Overwrite the `Schema` when the normalized data is passed to
        the `DataFrame` constructor.
    strict
        Whether Polars should be strict when constructing the DataFrame.
    infer_schema_length
        Number of rows to take into consideration to determine the schema.

    Examples
    --------
    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pl.json_normalize(data, max_level=1)
    shape: (3, 4)
    ┌──────┬────────────┬────────────────┬────────────────┐
    │ id   ┆ name       ┆ fitness.height ┆ fitness.weight │
    │ ---  ┆ ---        ┆ ---            ┆ ---            │
    │ i64  ┆ str        ┆ i64            ┆ i64            │
    ╞══════╪════════════╪════════════════╪════════════════╡
    │ 1    ┆ Cole Volk  ┆ 130            ┆ 60             │
    │ null ┆ Mark Reg   ┆ 130            ┆ 60             │
    │ 2    ┆ Faye Raker ┆ 130            ┆ 60             │
    └──────┴────────────┴────────────────┴────────────────┘
    >>> pl.json_normalize(data, max_level=0)
    shape: (3, 3)
    ┌──────┬────────────┬───────────────────────────────┐
    │ id   ┆ name       ┆ fitness                       │
    │ ---  ┆ ---        ┆ ---                           │
    │ i64  ┆ str        ┆ str                           │
    ╞══════╪════════════╪═══════════════════════════════╡
    │ 1    ┆ Cole Volk  ┆ {"height": 130, "weight": 60} │
    │ null ┆ Mark Reg   ┆ {"height": 130, "weight": 60} │
    │ 2    ┆ Faye Raker ┆ {"height": 130, "weight": 60} │
    └──────┴────────────┴───────────────────────────────┘
    """
    if max_level is None:
        # Effectively unbounded depth for any practical input.
        max_level = 1 << 32
    # Offset by one: each row dict itself consumes one recursion level, so
    # `max_level=0` still splits the top-level keys into columns while
    # JSON-encoding any nested dicts below them.
    max_level += 1
    if isinstance(data, list) and not data:
        # Empty input -> empty frame.
        return DataFrame()
    elif isinstance(data, dict):
        # A single mapping normalizes to a one-row frame.
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):  # type: ignore[redundant-expr]
        data = list(data)
    else:
        msg = "expected list of objects"
        raise ValueError(msg)
    return DataFrame(
        _simple_json_normalize(data, separator=separator, max_level=max_level),
        schema=schema,
        strict=strict,
        infer_schema_length=infer_schema_length,
    )


def normalize_json(
    data: Any,
    key_string: str,
    normalized_dict: dict[str, Any],
    separator: str,
    max_level: int,
) -> dict[str, Any]:
    """
    Main recursive function.

    Designed for the most basic use case of pl.json_normalize(data)
    intended as a performance improvement.

    Parameters
    ----------
    data : Any
        Type dependent on types contained within nested Json
    key_string : str
        New key (with separator(s) in) for data
    normalized_dict : dict
        The new normalized/flattened Json dict
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    max_level
        recursion depth
    """
    # Leaf value: store it under the accumulated key and stop.
    if not isinstance(data, dict):
        normalized_dict[key_string] = data
        return normalized_dict

    # Depth budget spent: encode the remaining subtree as a JSON string.
    if max_level <= 0:
        normalized_dict[key_string] = json.dumps(data)
        return normalized_dict

    for child_key, child_value in data.items():
        # Prefix with the parent path; at the top level (empty key_string)
        # the child key stands alone.
        if key_string:
            prefixed_key = f"{key_string}{separator}{child_key}"
        else:
            prefixed_key = f"{child_key}"
        normalize_json(
            data=child_value,
            key_string=prefixed_key,
            normalized_dict=normalized_dict,
            separator=separator,
            max_level=max_level - 1,
        )
    return normalized_dict
File renamed without changes.
55 changes: 55 additions & 0 deletions py-polars/tests/unit/io/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,58 @@ def test_json_wrong_input_handle_textio(tmp_path: Path) -> None:
df.write_ndjson(file_path)
with open(file_path) as f: # noqa: PTH123
assert_frame_equal(pl.read_ndjson(f), df)


def test_json_normalize() -> None:
    # Rows with heterogeneous nested "name" payloads.
    data = [
        {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
        {"name": {"given": "Mark", "family": "Regner"}},
        {"id": 2, "name": "Faye Raker"},
    ]

    # max_level=0: nested dicts are kept, JSON-encoded as strings.
    expected_encoded = {
        "id": [1, None, 2],
        "name": [
            '{"first": "Coleen", "last": "Volk"}',
            '{"given": "Mark", "family": "Regner"}',
            "Faye Raker",
        ],
    }
    result = pl.json_normalize(data, max_level=0).to_dict(as_series=False)
    assert result == expected_encoded

    # max_level=1: one level of nesting becomes separator-joined columns.
    expected_flat = {
        "id": [1, None, 2],
        "name.first": ["Coleen", None, None],
        "name.last": ["Volk", None, None],
        "name.given": [None, "Mark", None],
        "name.family": [None, "Regner", None],
        "name": [None, None, "Faye Raker"],
    }
    result = pl.json_normalize(data, max_level=1).to_dict(as_series=False)
    assert result == expected_flat

    # Rows where every record nests the same "fitness" sub-dict.
    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]
    expected_encoded = {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness": [
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
        ],
    }
    result = pl.json_normalize(data, max_level=0).to_dict(as_series=False)
    assert result == expected_encoded

    expected_flat = {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness.height": [130, 130, 130],
        "fitness.weight": [60, 60, 60],
    }
    result = pl.json_normalize(data, max_level=1).to_dict(as_series=False)
    assert result == expected_flat
Loading