feat(python)!: Remove serde functionality from pl.read_json and `DataFrame.write_json` (#16550)
stinodego authored Jun 4, 2024
1 parent f7f7d07 commit b61d4e6
Showing 7 changed files with 65 additions and 161 deletions.
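In practical terms, `DataFrame.write_json` now always emits row-oriented JSON and takes only an optional file target, while the polars-native format moves to `serialize`/`deserialize`. A minimal sketch of the post-change behavior, reusing the values from the updated docstring example below:

```python
import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})

# `row_oriented` and `pretty` are gone; the output is always row-oriented JSON,
# returned as a string when no file target is given.
assert df.write_json() == '[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]'
```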
3 changes: 3 additions & 0 deletions docs/src/python/user-guide/io/json.py
@@ -19,6 +19,9 @@
df.write_json("docs/data/path.json")
# --8<-- [end:write]

"""
# --8<-- [start:scan]
df = pl.scan_ndjson("docs/data/path.json")
# --8<-- [end:scan]
"""
64 changes: 10 additions & 54 deletions py-polars/polars/dataframe/frame.py
@@ -2295,30 +2295,12 @@ def serialize_to_string() -> str:
return None

@overload
def write_json(
self,
file: None = ...,
*,
row_oriented: bool = ...,
pretty: bool | None = ...,
) -> str: ...
def write_json(self, file: None = ...) -> str: ...

@overload
def write_json(
self,
file: IOBase | str | Path,
*,
row_oriented: bool = ...,
pretty: bool | None = ...,
) -> None: ...
def write_json(self, file: IOBase | str | Path) -> None: ...

def write_json(
self,
file: IOBase | str | Path | None = None,
*,
row_oriented: bool = False,
pretty: bool | None = None,
) -> str | None:
def write_json(self, file: IOBase | str | Path | None = None) -> str | None:
"""
Serialize to JSON representation.
@@ -2327,17 +2309,6 @@ def write_json(
file
File path or writable file-like object to which the result will be written.
If set to `None` (default), the output is returned as a string instead.
row_oriented
Write to row oriented json. This is slower, but more common.
pretty
Pretty serialize json.
.. deprecated:: 0.20.31
The `pretty` functionality for `write_json` will be removed in the next
breaking release. Use :meth:`serialize` to serialize the DataFrame in
the regular JSON format.
See Also
--------
@@ -2351,43 +2322,28 @@
... "bar": [6, 7, 8],
... }
... )
>>> df.write_json(row_oriented=True)
>>> df.write_json()
'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]'
"""
if pretty is not None:
issue_deprecation_warning(
"The `pretty` functionality for `write_json` will be removed in the next breaking release."
" Use `DataFrame.serialize` to serialize the DataFrame in the regular JSON format.",
version="0.20.31",
)
else:
pretty = False

if not row_oriented:
issue_deprecation_warning(
"`DataFrame.write_json` will only write row-oriented JSON in the next breaking release."
" Use `DataFrame.serialize` instead.",
version="0.20.31",
)

def write_json_to_string(*, pretty: bool, row_oriented: bool) -> str:
def write_json_to_string() -> str:
with BytesIO() as buf:
self._df.write_json_old(buf, pretty=pretty, row_oriented=row_oriented)
self._df.write_json(buf)
json_bytes = buf.getvalue()
return json_bytes.decode("utf8")

if file is None:
return write_json_to_string(pretty=pretty, row_oriented=row_oriented)
return write_json_to_string()
elif isinstance(file, StringIO):
json_str = write_json_to_string(pretty=pretty, row_oriented=row_oriented)
json_str = write_json_to_string()
file.write(json_str)
return None
elif isinstance(file, (str, Path)):
file = normalize_filepath(file)
self._df.write_json_old(file, pretty=pretty, row_oriented=row_oriented)
self._df.write_json(file)
return None
else:
self._df.write_json_old(file, pretty=pretty, row_oriented=row_oriented)
self._df.write_json(file)
return None

@overload
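The simplified dispatch above keeps the same set of file targets. A usage sketch, assuming this version of the API (the output path is illustrative):

```python
import io
from pathlib import Path

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})

# file=None (default): the row-oriented JSON is returned as a string.
json_str = df.write_json()

# StringIO: the JSON string is written into the text buffer.
buf = io.StringIO()
df.write_json(buf)

# str or Path: the path is normalized and handed to the Rust writer.
df.write_json(Path("frame.json"))  # illustrative output path
```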
3 changes: 0 additions & 3 deletions py-polars/pyproject.toml
@@ -238,9 +238,6 @@ filterwarnings = [
# TODO: Remove when behavior is updated
# https://github.com/pola-rs/polars/issues/13441
"ignore:.*default coalesce behavior of left join.*:DeprecationWarning",
# TODO: Remove when default is updated
# https://github.com/pola-rs/polars/issues/14526
"ignore:.*will only write row-oriented JSON.*:DeprecationWarning",
]
xfail_strict = true

68 changes: 15 additions & 53 deletions py-polars/src/dataframe/io.rs
@@ -215,44 +215,27 @@ impl PyDataFrame {
schema: Option<Wrap<Schema>>,
schema_overrides: Option<Wrap<Schema>>,
) -> PyResult<Self> {
// memmap the file first.

use crate::file::read_if_bytesio;
py_f = read_if_bytesio(py_f);
let mmap_bytes_r = get_mmap_bytes_reader(&py_f)?;

py.allow_threads(move || {
let mmap_read: ReaderBytes = (&mmap_bytes_r).into();
let bytes = mmap_read.deref();
// Happy path is our column oriented json as that is most performant,
// on failure we try the arrow json reader instead, which is row-oriented.
match serde_json::from_slice::<DataFrame>(bytes) {
Ok(df) => Ok(df.into()),
Err(e) => {
let msg = format!("{e}");
if msg.contains("successful parse invalid data") {
let e = PyPolarsErr::from(PolarsError::ComputeError(msg.into()));
Err(PyErr::from(e))
} else {
let mut builder = JsonReader::new(mmap_bytes_r)
.with_json_format(JsonFormat::Json)
.infer_schema_len(infer_schema_length);

if let Some(schema) = schema {
builder = builder.with_schema(Arc::new(schema.0));
}

if let Some(schema) = schema_overrides.as_ref() {
builder = builder.with_schema_overwrite(&schema.0);
}

let out = builder
.finish()
.map_err(|e| PyPolarsErr::Other(format!("{e}")))?;
Ok(out.into())
}
},
let mut builder = JsonReader::new(mmap_bytes_r)
.with_json_format(JsonFormat::Json)
.infer_schema_len(infer_schema_length);

if let Some(schema) = schema {
builder = builder.with_schema(Arc::new(schema.0));
}

if let Some(schema) = schema_overrides.as_ref() {
builder = builder.with_schema_overwrite(&schema.0);
}

let out = builder
.finish()
.map_err(|e| PyPolarsErr::Other(format!("{e}")))?;
Ok(out.into())
})
}

@@ -493,27 +476,6 @@ impl PyDataFrame {
.map_err(|e| PyPolarsErr::Other(format!("{e}")).into())
}

/// This method can be removed entirely in the next breaking release.
#[cfg(feature = "json")]
pub fn write_json_old(
&mut self,
py_f: PyObject,
pretty: bool,
row_oriented: bool,
) -> PyResult<()> {
match (pretty, row_oriented) {
(_, true) => self.write_json(py_f),
(false, _) => self.serialize(py_f),
(true, _) => {
let file = BufWriter::new(get_file_like(py_f, true)?);

serde_json::to_writer_pretty(file, &self.df)
.map_err(|e| polars_err!(ComputeError: "{e}"))
.map_err(|e| PyPolarsErr::Other(format!("{e}")).into())
},
}
}

#[cfg(feature = "json")]
pub fn write_ndjson(&mut self, py_f: PyObject) -> PyResult<()> {
let file = BufWriter::new(get_file_like(py_f, true)?);
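With the column-oriented happy path removed, `pl.read_json` always goes through the arrow JSON reader built above. A hedged sketch of reading record-oriented JSON with a schema override (the sample data is illustrative):

```python
import io

import polars as pl

data = b'[{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]'

# BytesIO input is unwrapped by read_if_bytesio before the reader is built;
# schema_overrides maps onto with_schema_overwrite in the builder above.
df = pl.read_json(io.BytesIO(data), schema_overrides={"a": pl.UInt8})
```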
41 changes: 29 additions & 12 deletions py-polars/tests/unit/dataframe/test_serde.py
@@ -146,15 +146,32 @@ def test_json_deserialize_empty_list_10458() -> None:
assert df.schema == schema


def test_df_write_json_deprecated() -> None:
df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
with pytest.deprecated_call():
result = df.write_json()
assert result == df.serialize()


def test_df_write_json_pretty_deprecated() -> None:
df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
with pytest.deprecated_call():
result = df.write_json(pretty=True)
assert isinstance(result, str)
def test_serde_validation() -> None:
f = io.StringIO(
"""
{
"columns": [
{
"name": "a",
"datatype": "Int64",
"values": [
1,
2
]
},
{
"name": "b",
"datatype": "Int64",
"values": [
1
]
}
]
}
"""
)
with pytest.raises(
pl.ComputeError,
match=r"lengths don't match",
):
pl.DataFrame.deserialize(f)
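For the native format that `write_json` no longer produces, the `serialize`/`deserialize` pair exercised by the relocated test above handles the round trip. A minimal sketch of the happy path:

```python
import io

import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

# serialize() returns the column-oriented payload that deserialize expects;
# mismatched column lengths raise the ComputeError checked in the test above.
payload = df.serialize()
restored = pl.DataFrame.deserialize(io.StringIO(payload))
assert restored.equals(df)
```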
16 changes: 8 additions & 8 deletions py-polars/tests/unit/io/test_json.py
@@ -11,9 +11,9 @@
from polars.testing import assert_frame_equal


def test_write_json_row_oriented() -> None:
def test_write_json() -> None:
df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]})
out = df.write_json(row_oriented=True)
out = df.write_json()
assert out == '[{"a":1,"b":"a"},{"a":2,"b":"b"},{"a":3,"b":null}]'

# Test round trip
@@ -27,11 +27,10 @@ def test_write_json_row_oriented() -> None:
def test_write_json_categoricals() -> None:
data = {"column": ["test1", "test2", "test3", "test4"]}
df = pl.DataFrame(data).with_columns(pl.col("column").cast(pl.Categorical))

assert (
df.write_json(row_oriented=True, file=None)
== '[{"column":"test1"},{"column":"test2"},{"column":"test3"},{"column":"test4"}]'
expected = (
'[{"column":"test1"},{"column":"test2"},{"column":"test3"},{"column":"test4"}]'
)
assert df.write_json() == expected


def test_write_json_duration() -> None:
@@ -44,8 +43,9 @@ def test_write_json_duration() -> None:
)

# we don't guarantee a format, just round-circling
value = str(df.write_json(row_oriented=True))
assert value == """[{"a":"PT91762.939S"},{"a":"PT91762.89S"},{"a":"PT6020.836S"}]"""
value = df.write_json()
expected = '[{"a":"PT91762.939S"},{"a":"PT91762.89S"},{"a":"PT6020.836S"}]'
assert value == expected


def test_json_infer_schema_length_11148() -> None:
Expand Down
31 changes: 0 additions & 31 deletions py-polars/tests/unit/test_errors.py
@@ -538,37 +538,6 @@ def test_invalid_group_by_arg() -> None:
df.group_by(1).agg({"a": "sum"})


def test_serde_validation() -> None:
f = io.StringIO(
"""
{
"columns": [
{
"name": "a",
"datatype": "Int64",
"values": [
1,
2
]
},
{
"name": "b",
"datatype": "Int64",
"values": [
1
]
}
]
}
"""
)
with pytest.raises(
pl.ComputeError,
match=r"lengths don't match",
):
pl.read_json(f)


def test_overflow_msg() -> None:
with pytest.raises(
pl.ComputeError,
