Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python)!: Implement binary serialization of LazyFrame/DataFrame/Expr and set it as the default format #17223

Merged
merged 18 commits into from
Jun 28, 2024
Merged
2 changes: 1 addition & 1 deletion crates/polars-core/src/datatypes/_serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ impl<'de> serde::Deserialize<'de> for Wrap<Utf8ViewArray> {
{
let mut utf8array = MutablePlString::with_capacity(seq.size_hint().unwrap_or(10));
while let Some(key) = seq.next_element()? {
let key: Option<&str> = key;
let key: Option<String> = key;
utf8array.push(key)
}
Ok(Wrap(utf8array.into()))
Expand Down
63 changes: 63 additions & 0 deletions py-polars/polars/_utils/serde.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Utility for serializing Polars objects."""

from __future__ import annotations

from io import BytesIO, StringIO
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Literal, overload

from polars._utils.various import normalize_filepath

if TYPE_CHECKING:
from io import IOBase

from polars.type_aliases import SerializationFormat


@overload
def serialize_polars_object(
serializer: Callable[[IOBase | str], None], file: None, format: Literal["binary"]
) -> bytes: ...
@overload
def serialize_polars_object(
serializer: Callable[[IOBase | str], None], file: None, format: Literal["json"]
) -> str: ...
@overload
def serialize_polars_object(
serializer: Callable[[IOBase | str], None],
file: IOBase | str | Path,
format: SerializationFormat,
) -> None: ...


def serialize_polars_object(
serializer: Callable[[IOBase | str], None],
file: IOBase | str | Path | None,
format: SerializationFormat,
) -> bytes | str | None:
"""Serialize a Polars object (DataFrame/LazyFrame/Expr)."""

def serialize_to_bytes() -> bytes:
with BytesIO() as buf:
serializer(buf)
serialized = buf.getvalue()
return serialized

if file is None:
serialized = serialize_to_bytes()
return serialized.decode() if format == "json" else serialized
elif isinstance(file, StringIO):
serialized_str = serialize_to_bytes().decode()
file.write(serialized_str)
return None
elif isinstance(file, BytesIO):
serialized = serialize_to_bytes()
file.write(serialized)
return None
elif isinstance(file, (str, Path)):
file = normalize_filepath(file)
serializer(file)
return None
else:
serializer(file)
return None
117 changes: 83 additions & 34 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
)
from polars._utils.getitem import get_df_item_by_key
from polars._utils.parse import parse_into_expression
from polars._utils.serde import serialize_polars_object
from polars._utils.unstable import issue_unstable_warning, unstable
from polars._utils.various import (
is_bool_sequence,
Expand Down Expand Up @@ -100,7 +101,11 @@
from polars.functions import col, lit
from polars.schema import Schema
from polars.selectors import _expand_selector_dicts, _expand_selectors
from polars.type_aliases import DbWriteMode, JaxExportType, TorchExportType
from polars.type_aliases import (
DbWriteMode,
JaxExportType,
TorchExportType,
)

with contextlib.suppress(ImportError): # Module not available when building docs
from polars.polars import PyDataFrame
Expand Down Expand Up @@ -159,6 +164,7 @@
SchemaDefinition,
SchemaDict,
SelectorType,
SerializationFormat,
SingleColSelector,
SingleIndexSelector,
SizeUnit,
Expand Down Expand Up @@ -416,29 +422,39 @@ def __init__(
raise TypeError(msg)

@classmethod
def deserialize(cls, source: str | Path | IOBase) -> DataFrame:
def deserialize(
cls, source: str | Path | IOBase, *, format: SerializationFormat = "binary"
) -> DataFrame:
"""
Read a serialized DataFrame from a file.

.. versionadded:: 0.20.31

Parameters
----------
source
Path to a file or a file-like object (by file-like object, we refer to
objects that have a `read()` method, such as a file handler (e.g.
via builtin `open` function) or `BytesIO`).
format
The format with which the DataFrame was serialized. Options:

- `"binary"`: Deserialize from binary format (bytes). This is the default.
- `"json"`: Deserialize from JSON format (string).

See Also
--------
DataFrame.serialize

Notes
-----
Serialization is not stable across Polars versions: a LazyFrame serialized
in one Polars version may not be deserializable in another Polars version.

Examples
--------
>>> import io
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
>>> json = df.serialize()
>>> pl.DataFrame.deserialize(io.StringIO(json))
>>> bytes = df.serialize()
>>> pl.DataFrame.deserialize(io.BytesIO(bytes))
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
Expand All @@ -455,7 +471,15 @@ def deserialize(cls, source: str | Path | IOBase) -> DataFrame:
elif isinstance(source, (str, Path)):
source = normalize_filepath(source)

return cls._from_pydf(PyDataFrame.deserialize(source))
if format == "binary":
deserializer = PyDataFrame.deserialize_binary
elif format == "json":
deserializer = PyDataFrame.deserialize_json
else:
msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
raise ValueError(msg)

return cls._from_pydf(deserializer(source))

@classmethod
def _from_pydf(cls, py_df: PyDataFrame) -> DataFrame:
Expand Down Expand Up @@ -2351,54 +2375,79 @@ def to_init_repr(self, n: int = 1000) -> str:
return output.getvalue()

@overload
def serialize(self, file: None = ...) -> str: ...

def serialize(
self, file: None = ..., *, format: Literal["binary"] = ...
) -> bytes: ...
@overload
def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ...
@overload
def serialize(self, file: IOBase | str | Path) -> None: ...
def serialize(
self, file: IOBase | str | Path, *, format: SerializationFormat = ...
) -> None: ...

def serialize(self, file: IOBase | str | Path | None = None) -> str | None:
"""
def serialize(
self,
file: IOBase | str | Path | None = None,
*,
format: SerializationFormat = "binary",
) -> bytes | str | None:
r"""
Serialize this DataFrame to a file or string in JSON format.

.. versionadded:: 0.20.31

Parameters
----------
file
File path or writable file-like object to which the result will be written.
If set to `None` (default), the output is returned as a string instead.
format
The format in which to serialize. Options:

- `"binary"`: Serialize to binary format (bytes). This is the default.
- `"json"`: Serialize to JSON format (string).

Notes
-----
Serialization is not stable across Polars versions: a LazyFrame serialized
in one Polars version may not be deserializable in another Polars version.

Examples
--------
Serialize the DataFrame into a binary representation.

>>> df = pl.DataFrame(
... {
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... }
... )
>>> df.serialize()
'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}'
"""
>>> bytes = df.serialize()
>>> bytes # doctest: +ELLIPSIS
b'\xa1gcolumns\x82\xa4dnamecfoohdatatypeeInt64lbit_settings\x00fvalues\x83...'

def serialize_to_string() -> str:
with BytesIO() as buf:
self._df.serialize(buf)
json_bytes = buf.getvalue()
return json_bytes.decode("utf8")
The bytes can later be deserialized back into a DataFrame.

if file is None:
return serialize_to_string()
elif isinstance(file, StringIO):
json_str = serialize_to_string()
file.write(json_str)
return None
elif isinstance(file, (str, Path)):
file = normalize_filepath(file)
self._df.serialize(file)
return None
>>> import io
>>> pl.DataFrame.deserialize(io.BytesIO(bytes))
shape: (3, 2)
┌─────┬─────┐
│ foo ┆ bar │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 6 │
│ 2 ┆ 7 │
│ 3 ┆ 8 │
└─────┴─────┘
"""
if format == "binary":
serializer = self._df.serialize_binary
elif format == "json":
serializer = self._df.serialize_json
else:
self._df.serialize(file)
return None
msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
raise ValueError(msg)

return serialize_polars_object(serializer, file, format)

@overload
def write_json(self, file: None = ...) -> str: ...
Expand Down
37 changes: 28 additions & 9 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
RankMethod,
RollingInterpolationMethod,
SearchSortedSide,
SerializationFormat,
TemporalLiteral,
WindowMappingStrategy,
)
Expand Down Expand Up @@ -328,7 +329,9 @@ def function(s: Series) -> Series: # pragma: no cover
return root_expr.map_batches(function, is_elementwise=True).meta.undo_aliases()

@classmethod
def deserialize(cls, source: str | Path | IOBase) -> Expr:
def deserialize(
cls, source: str | Path | IOBase, *, format: SerializationFormat = "binary"
) -> Expr:
"""
Read a serialized expression from a file.

Expand All @@ -338,33 +341,49 @@ def deserialize(cls, source: str | Path | IOBase) -> Expr:
Path to a file or a file-like object (by file-like object, we refer to
objects that have a `read()` method, such as a file handler (e.g.
via builtin `open` function) or `BytesIO`).
format
The format with which the Expr was serialized. Options:

- `"binary"`: Deserialize from binary format (bytes). This is the default.
- `"json"`: Deserialize from JSON format (string).

Warnings
--------
This function uses :mod:`pickle` when the logical plan contains Python UDFs,
This function uses :mod:`pickle` if the logical plan contains Python UDFs,
and as such inherits the security implications. Deserializing can execute
arbitrary code, so it should only be attempted on trusted data.

See Also
--------
Expr.meta.serialize

Notes
-----
Serialization is not stable across Polars versions: a LazyFrame serialized
in one Polars version may not be deserializable in another Polars version.

Examples
--------
>>> from io import StringIO
>>> import io
>>> expr = pl.col("foo").sum().over("bar")
>>> json = expr.meta.serialize()
>>> pl.Expr.deserialize(StringIO(json)) # doctest: +ELLIPSIS
>>> bytes = expr.meta.serialize()
>>> pl.Expr.deserialize(io.BytesIO(bytes)) # doctest: +ELLIPSIS
<Expr ['col("foo").sum().over([col("ba…'] at ...>
"""
if isinstance(source, StringIO):
source = BytesIO(source.getvalue().encode())
elif isinstance(source, (str, Path)):
source = normalize_filepath(source)

expr = cls.__new__(cls)
expr._pyexpr = PyExpr.deserialize(source)
return expr
if format == "binary":
deserializer = PyExpr.deserialize_binary
elif format == "json":
deserializer = PyExpr.deserialize_json
else:
msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
raise ValueError(msg)

return cls._from_pyexpr(deserializer(source))

def to_physical(self) -> Expr:
"""
Expand Down Expand Up @@ -10479,7 +10498,7 @@ def from_json(cls, value: str) -> Expr:
" Enclose your input in `io.StringIO` to keep the same behavior.",
version="0.20.11",
)
return cls.deserialize(StringIO(value))
return cls.deserialize(StringIO(value), format="json")

@property
def bin(self) -> ExprBinaryNameSpace:
Expand Down
Loading