Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Add simple version of json_normalize #17015

Merged
merged 4 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Conversion
from_pandas
from_records
from_repr
json_normalize

Miscellaneous
~~~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from_pandas,
from_records,
from_repr,
json_normalize,
)
from polars.dataframe import DataFrame
from polars.datatypes import (
Expand Down Expand Up @@ -428,6 +429,7 @@
"from_pandas",
"from_records",
"from_repr",
"json_normalize",
# polars.sql
"SQLContext",
"sql",
Expand Down
23 changes: 23 additions & 0 deletions py-polars/polars/convert/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from polars.convert.general import (
from_arrow,
from_dataframe,
from_dict,
from_dicts,
from_numpy,
from_pandas,
from_records,
from_repr,
)
from polars.convert.normalize import json_normalize

# Public API of `polars.convert`: every name listed here is imported above
# from `polars.convert.general` or `polars.convert.normalize`.
__all__ = [
"from_arrow",
"from_dataframe",
"from_dict",
"from_dicts",
"from_numpy",
"from_pandas",
"from_records",
"from_repr",
"json_normalize",
]
File renamed without changes.
230 changes: 230 additions & 0 deletions py-polars/polars/convert/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
# This code is partially forked and adapted from pandas.
# Some parts are distributed under: https://github.com/pandas-dev/pandas/blob/main/LICENSE
from __future__ import annotations

import json
from collections import abc
from typing import TYPE_CHECKING, Any, Sequence

from polars._utils.unstable import unstable
from polars.dataframe import DataFrame
from polars.datatypes.constants import N_INFER_DEFAULT

if TYPE_CHECKING:
from polars.schema import Schema

import sys

if sys.version_info >= (3, 9):

def _remove_prefix(text: str, prefix: str) -> str:
return text.removeprefix(prefix)
else:

def _remove_prefix(text: str, prefix: str) -> str:
if text.startswith(prefix):
return text[len(prefix) :]
return text


def _simple_json_normalize(
data: dict[Any, Any] | Sequence[dict[Any, Any] | Any],
separator: str,
max_level: int,
) -> dict[Any, Any] | list[dict[Any, Any]] | Any:
if max_level > 0:
normalized_json_object = {}
# expect a dictionary, as most jsons are. However, lists are perfectly valid
if isinstance(data, dict):
normalized_json_object = _normalize_json_ordered(
data=data, separator=separator, max_level=max_level
)
elif isinstance(data, list):
normalised_json_list = [
_simple_json_normalize(row, separator=separator, max_level=max_level)
for row in data
]
return normalised_json_list
return normalized_json_object
else:
return data


def _normalize_json_ordered(
    data: dict[str, Any], separator: str, max_level: int
) -> dict[str, Any]:
    """
    Flatten one record, keeping its non-dict top-level entries first.

    Parameters
    ----------
    data
        The record (a dict) to flatten.
    separator
        String placed between parent and child keys in flattened names,
        e.g., for separator='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    max_level
        Depth budget forwarded to `normalize_json`.

    Returns
    -------
    dict
        The non-dict top-level entries in their original order, updated with
        the flattened entries produced from the dict-valued ones.
    """
    # Split the record in one pass: plain values stay as they are, while
    # dict values are handed to the recursive flattener.
    plain_items: dict[str, Any] = {}
    dict_items: dict[str, Any] = {}
    for key, value in data.items():
        bucket = dict_items if isinstance(value, dict) else plain_items
        bucket[key] = value

    flattened = normalize_json(
        data=dict_items,
        key_string="",
        normalized_dict={},
        separator=separator,
        max_level=max_level,
    )

    # On a name clash the flattened entry replaces the plain one.
    plain_items.update(flattened)
    return plain_items


@unstable()
def json_normalize(
    data: dict[Any, Any] | Sequence[dict[Any, Any] | Any],
    *,
    separator: str = ".",
    max_level: int | None = None,
    schema: Schema | None = None,
    strict: bool = True,
    infer_schema_length: int | None = N_INFER_DEFAULT,
) -> DataFrame:
    """
    Normalize semi-structured deserialized JSON data into a flat table.

    Dictionary objects that will not be unnested/normalized are encoded
    as json string data. Unlike its pandas counterpart, this function will
    not encode dictionaries as objects at any level.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    Parameters
    ----------
    data
        Deserialized JSON objects.
    separator
        Nested records will generate names separated by `separator`, e.g.,
        for `separator="."`, `{"foo": {"bar": 0}}` -> `foo.bar`.
    max_level
        Max number of levels (depth of dict) to normalize.
        If None, normalizes all levels.
    schema
        Overwrite the `Schema` when the normalized data is passed to
        the `DataFrame` constructor.
    strict
        Whether Polars should be strict when constructing the DataFrame.
    infer_schema_length
        Number of rows to take into consideration to determine the schema.

    Returns
    -------
    DataFrame
        One row per input object. An empty input list yields an empty frame
        built from `schema`.

    Raises
    ------
    ValueError
        If `data` is a string or is not iterable.

    Examples
    --------
    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pl.json_normalize(data, max_level=1)
    shape: (3, 4)
    ┌──────┬────────────┬────────────────┬────────────────┐
    │ id   ┆ name       ┆ fitness.height ┆ fitness.weight │
    │ ---  ┆ ---        ┆ ---            ┆ ---            │
    │ i64  ┆ str        ┆ i64            ┆ i64            │
    ╞══════╪════════════╪════════════════╪════════════════╡
    │ 1    ┆ Cole Volk  ┆ 130            ┆ 60             │
    │ null ┆ Mark Reg   ┆ 130            ┆ 60             │
    │ 2    ┆ Faye Raker ┆ 130            ┆ 60             │
    └──────┴────────────┴────────────────┴────────────────┘
    >>> pl.json_normalize(data, max_level=0)
    shape: (3, 3)
    ┌──────┬────────────┬───────────────────────────────┐
    │ id   ┆ name       ┆ fitness                       │
    │ ---  ┆ ---        ┆ ---                           │
    │ i64  ┆ str        ┆ str                           │
    ╞══════╪════════════╪═══════════════════════════════╡
    │ 1    ┆ Cole Volk  ┆ {"height": 130, "weight": 60} │
    │ null ┆ Mark Reg   ┆ {"height": 130, "weight": 60} │
    │ 2    ┆ Faye Raker ┆ {"height": 130, "weight": 60} │
    └──────┴────────────┴───────────────────────────────┘

    """
    if max_level is None:
        # Effectively unlimited depth.
        max_level = 1 << 32
    # The helpers count the top-level dict as one level, so shift the
    # user-facing value (where 0 means "do not unnest anything") by one.
    max_level += 1

    if isinstance(data, list) and not data:
        # Keep the requested schema instead of returning a column-less frame.
        return DataFrame(schema=schema)
    elif isinstance(data, dict):
        # A single object is treated as a one-row table.
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):  # type: ignore[redundant-expr]
        data = list(data)
    else:
        msg = "expected list of objects"
        raise ValueError(msg)

    return DataFrame(
        _simple_json_normalize(data, separator=separator, max_level=max_level),
        schema=schema,
        strict=strict,
        infer_schema_length=infer_schema_length,
    )


def normalize_json(
    data: Any,
    key_string: str,
    normalized_dict: dict[str, Any],
    separator: str,
    max_level: int,
) -> dict[str, Any]:
    """
    Recursively flatten `data` into `normalized_dict`.

    Parameters
    ----------
    data : Any
        The value to flatten; only dicts are descended into.
    key_string : str
        Flattened key accumulated so far (empty at the top level).
    normalized_dict : dict
        Accumulator that receives the flattened entries; it is mutated in
        place and also returned.
    separator : str
        String placed between parent and child keys,
        e.g., for separator='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    max_level
        Remaining recursion depth. A dict reached with no depth left is
        stored as a JSON string instead of being expanded.
    """
    if not isinstance(data, dict):
        # Leaf value: store it under the key built so far.
        normalized_dict[key_string] = data
        return normalized_dict

    if max_level <= 0:
        # Depth budget exhausted: keep the remaining structure as JSON text.
        normalized_dict[key_string] = json.dumps(data)
        return normalized_dict

    at_top_level = not key_string
    for child_key, child_value in data.items():
        flat_key = f"{key_string}{separator}{child_key}"
        if at_top_level:
            # No parent key yet, so drop the separator that was just prepended.
            flat_key = _remove_prefix(flat_key, separator)

        normalize_json(
            data=child_value,
            key_string=flat_key,
            normalized_dict=normalized_dict,
            separator=separator,
            max_level=max_level - 1,
        )
    return normalized_dict
3 changes: 3 additions & 0 deletions py-polars/polars/io/json/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from polars.io.json.read import read_json

# `read_json` (imported above) is the only name this package exports.
__all__ = ["read_json"]
File renamed without changes.
55 changes: 55 additions & 0 deletions py-polars/tests/unit/io/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,58 @@ def test_json_wrong_input_handle_textio(tmp_path: Path) -> None:
df.write_ndjson(file_path)
with open(file_path) as f: # noqa: PTH123
assert_frame_equal(pl.read_ndjson(f), df)


def test_json_normalize() -> None:
    """Check `pl.json_normalize` output at `max_level` 0 and 1."""
    # Records whose "name" is a dict in some rows (with differing keys) and a
    # plain string in another.
    mixed_names = [
        {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
        {"name": {"given": "Mark", "family": "Regner"}},
        {"id": 2, "name": "Faye Raker"},
    ]

    # Level 0: nested dicts are kept as JSON strings.
    level_zero = pl.json_normalize(mixed_names, max_level=0)
    assert level_zero.to_dict(as_series=False) == {
        "id": [1, None, 2],
        "name": [
            '{"first": "Coleen", "last": "Volk"}',
            '{"given": "Mark", "family": "Regner"}',
            "Faye Raker",
        ],
    }

    # Level 1: each nested key becomes its own column.
    level_one = pl.json_normalize(mixed_names, max_level=1)
    assert level_one.to_dict(as_series=False) == {
        "id": [1, None, 2],
        "name.first": ["Coleen", None, None],
        "name.last": ["Volk", None, None],
        "name.given": [None, "Mark", None],
        "name.family": [None, "Regner", None],
        "name": [None, None, "Faye Raker"],
    }

    # Records sharing the same nested layout in every row.
    uniform_records = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]

    level_zero = pl.json_normalize(uniform_records, max_level=0)
    assert level_zero.to_dict(as_series=False) == {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness": [
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
        ],
    }

    level_one = pl.json_normalize(uniform_records, max_level=1)
    assert level_one.to_dict(as_series=False) == {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness.height": [130, 130, 130],
        "fitness.weight": [60, 60, 60],
    }