Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of serializer significantly #51

Merged
merged 8 commits into from
Nov 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ __pycache__/
.coverage
.auto-format
/toml-test/
/.benchmarks/
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ sdist-include = ["Cargo.lock"]
[tool.pytest.ini_options]
testpaths = ["tests"]
filterwarnings = ["error"]
xfail_strict = true
# avoid pytest warning when pytest-speed is not installed
markers = ["benchmark"]

Expand Down
262 changes: 143 additions & 119 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
extern crate pyo3;

use crate::py_type::PyTypeLookup;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::{PyAny, PyDateTime, PyDict, PyFloat, PyList, PyTuple};
use pyo3::types::{PyAny, PyDate, PyDateTime, PyDict, PyList, PyString, PyTime, PyTuple};
use pyo3::{create_exception, wrap_pyfunction, PyErrArguments};
use serde::ser::{self, Serialize, SerializeMap, SerializeSeq, Serializer};
use serde::ser::{Error as SerError, Serialize, SerializeMap, SerializeSeq, Serializer};
use std::fmt;
use std::str::FromStr;
use toml::Value::{Array, Boolean, Datetime, Float, Integer, String as TomlString, Table};
use toml::{to_string as to_toml_string, to_string_pretty as to_toml_string_pretty, Value};
Expand All @@ -14,6 +16,7 @@ use toml::{to_string as to_toml_string, to_string_pretty as to_toml_string_prett
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

mod datetime;
mod py_type;

const VERSION: &str = env!("CARGO_PKG_VERSION");
create_exception!(_rtoml, TomlParsingError, PyValueError);
Expand Down Expand Up @@ -52,158 +55,179 @@ fn deserialize(py: Python, toml: String) -> PyResult<PyObject> {
}
}

// taken from https://github.com/mre/hyperjson/blob/10d31608584ef4499d6b6b10b6dc9455b358fe3d/src/lib.rs#L287-L402
struct SerializePyObject<'p, 'a> {
py: Python<'p>,
obj: &'a PyAny,
struct SerializePyObject<'py> {
obj: &'py PyAny,
py: Python<'py>,
ob_type_lookup: &'py PyTypeLookup,
}

impl<'p, 'a> Serialize for SerializePyObject<'p, 'a> {
impl<'py> SerializePyObject<'py> {
fn new(py: Python<'py>, obj: &'py PyAny) -> Self {
Self {
obj,
py,
ob_type_lookup: PyTypeLookup::cached(py),
}
}

fn with_obj(&self, obj: &'py PyAny) -> Self {
Self {
obj,
py: self.py,
ob_type_lookup: self.ob_type_lookup,
}
}
}

/// Expands to an `Err` carrying a serde custom error whose message is built with `format!`.
///
/// Expects a format string followed by one or more format arguments.
macro_rules! serde_err {
    ($fmt:expr, $( $arg:expr ),+ ) => {
        Err(SerError::custom(format!($fmt, $( $arg ),+ )))
    };
}

impl<'py> Serialize for SerializePyObject<'py> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
macro_rules! cast {
($f:expr) => {
if let Ok(val) = PyTryFrom::try_from(self.obj) {
return $f(val);
}
};
}

macro_rules! extract {
macro_rules! serialize {
($t:ty) => {
if let Ok(val) = <$t as FromPyObject>::extract(self.obj) {
return val.serialize(serializer);
match self.obj.extract::<$t>() {
Ok(v) => v.serialize(serializer),
Err(e) => Err(map_py_err(e)),
}
};
}

macro_rules! isa {
($v:ident, $t:ty) => {
$v.is_instance_of::<$t>().map_err(debug_py_err)?
};
}

macro_rules! add_to_map {
($map:ident, $key:ident, $value:ident) => {
if $key.is_none() {
$map.serialize_key("null")?;
} else if let Ok(key) = $key.extract::<bool>() {
$map.serialize_key(if key { "true" } else { "false" })?;
} else if let Ok(key) = $key.str() {
let key = key.to_string();
$map.serialize_key(&key)?;
let lookup = self.ob_type_lookup;
let ob_type = self.obj.get_type_ptr() as usize;
// ugly but this seems to be just marginally faster than a guarded match, also allows for custom cases
// if we wanted to add them
if ob_type == lookup.none {
serializer.serialize_str("null")
} else if ob_type == lookup.int {
serialize!(i64)
} else if ob_type == lookup.bool {
serialize!(bool)
} else if ob_type == lookup.float {
serialize!(f64)
} else if ob_type == lookup.string {
serialize!(&str)
} else if ob_type == lookup.bytes || ob_type == lookup.bytearray {
serialize!(&[u8])
} else if ob_type == lookup.dict {
let py_dict: &PyDict = self.obj.cast_as().map_err(map_py_err)?;

let len = py_dict.len();
let mut simple_items: Vec<(&PyAny, &PyAny)> = Vec::with_capacity(len);
let mut array_items: Vec<(&PyAny, &PyAny)> = Vec::with_capacity(len);
let mut dict_items: Vec<(&PyAny, &PyAny)> = Vec::with_capacity(len);

for (k, v) in py_dict {
if v.cast_as::<PyDict>().is_ok() {
dict_items.push((k, v));
} else if v.cast_as::<PyList>().is_ok() || v.cast_as::<PyTuple>().is_ok() {
array_items.push((k, v));
} else {
return Err(ser::Error::custom(format_args!(
"Dictionary key is not a string: {:?}",
$key
)));
}
$map.serialize_value(&SerializePyObject {
py: self.py,
obj: $value,
})?;
};
}

fn debug_py_err<E: ser::Error>(err: PyErr) -> E {
E::custom(format_args!("{err:?}"))
}

cast!(|x: &PyDict| {
let mut map = serializer.serialize_map(Some(x.len()))?;

// https://github.com/alexcrichton/toml-rs/issues/142#issuecomment-278970591
// taken from alexcrichton/toml-rs/blob/ec4e821f3bb081391801e4c00aa90bf66a53562c/src/value.rs#L364-L387
for (k, v) in x {
if !isa!(v, PyList) && !isa!(v, PyTuple) && !isa!(v, PyDict) {
add_to_map!(map, k, v);
simple_items.push((k, v));
}
}
for (k, v) in x {
if isa!(v, PyList) || isa!(v, PyTuple) {
add_to_map!(map, k, v);
}
let mut map = serializer.serialize_map(Some(len))?;
for (k, v) in simple_items {
let key = table_key(k)?;
let value = self.with_obj(v);
map.serialize_entry(key, &value)?;
}
for (k, v) in x {
if isa!(v, PyDict) {
add_to_map!(map, k, v);
}
for (k, v) in array_items {
let key = table_key(k)?;
let value = self.with_obj(v);
map.serialize_entry(key, &value)?;
}
for (k, v) in dict_items {
let key = table_key(k)?;
let value = self.with_obj(v);
map.serialize_entry(key, &value)?;
}
map.end()
});

macro_rules! to_seq {
($type:ty) => {
cast!(|x: $type| {
let mut seq = serializer.serialize_seq(Some(x.len()))?;
for element in x {
seq.serialize_element(&SerializePyObject {
py: self.py,
obj: element,
})?
}
return seq.end();
});
};
}

to_seq!(&PyList);
to_seq!(&PyTuple);

cast!(|x: &PyDateTime| {
let dt_str: &str = x.str().map_err(debug_py_err)?.extract().map_err(debug_py_err)?;
} else if ob_type == lookup.list {
let py_list: &PyList = self.obj.cast_as().map_err(map_py_err)?;
let mut seq = serializer.serialize_seq(Some(py_list.len()))?;
for element in py_list {
seq.serialize_element(&self.with_obj(element))?
}
seq.end()
} else if ob_type == lookup.tuple {
let py_tuple: &PyTuple = self.obj.cast_as().map_err(map_py_err)?;
let mut seq = serializer.serialize_seq(Some(py_tuple.len()))?;
for element in py_tuple {
seq.serialize_element(&self.with_obj(element))?
}
seq.end()
} else if ob_type == lookup.datetime {
let py_dt: &PyDateTime = self.obj.cast_as().map_err(map_py_err)?;
let dt_str = py_dt.str().map_err(map_py_err)?.to_str().map_err(map_py_err)?;
let iso_str = dt_str.replacen("+00:00", "Z", 1);
match toml::value::Datetime::from_str(&iso_str) {
Ok(dt) => dt.serialize(serializer),
Err(e) => Err(ser::Error::custom(format_args!(
"unable to convert datetime string to toml datetime object {:?}",
e
))),
Err(e) => serde_err!("unable to convert datetime string to TOML datetime object {:?}", e),
}
});

extract!(String);
extract!(bool);
} else if ob_type == lookup.date {
let py_date: &PyDate = self.obj.cast_as().map_err(map_py_err)?;
let date_str = py_date.str().map_err(map_py_err)?.to_str().map_err(map_py_err)?;
match toml::value::Datetime::from_str(date_str) {
Ok(dt) => dt.serialize(serializer),
Err(e) => serde_err!("unable to convert date string to TOML date object {:?}", e),
}
} else if ob_type == lookup.time {
let py_time: &PyTime = self.obj.cast_as().map_err(map_py_err)?;
let time_str = py_time.str().map_err(map_py_err)?.to_str().map_err(map_py_err)?;
match toml::value::Datetime::from_str(time_str) {
Ok(dt) => dt.serialize(serializer),
Err(e) => serde_err!("unable to convert time string to TOML time object {:?}", e),
}
} else {
serde_err!("{} is not serializable to TOML", any_repr(self.obj))
}
}
}

cast!(|x: &PyFloat| x.value().serialize(serializer));
extract!(u64);
extract!(i64);
/// Converts any displayable error into a serializer error carrying the same message text.
fn map_py_err<I: fmt::Display, O: SerError>(err: I) -> O {
    let message = err.to_string();
    O::custom(message)
}

if self.obj.is_none() {
return serializer.serialize_str("null");
}
/// Produces the TOML table key for a Python dictionary key.
///
/// Strings are used as-is, `None` becomes `"null"`, and booleans become `"true"` / `"false"`.
///
/// # Errors
///
/// Returns a serializer error if the key is none of the above, or if a string key cannot be
/// read as `&str`.
fn table_key<E: SerError>(key: &PyAny) -> Result<&str, E> {
    // Strings are checked first, ahead of the `None` and bool fallbacks.
    if let Ok(text) = key.cast_as::<PyString>() {
        return text.to_str().map_err(map_py_err);
    }
    if key.is_none() {
        return Ok("null");
    }
    match key.extract::<bool>() {
        Ok(true) => Ok("true"),
        Ok(false) => Ok("false"),
        Err(_) => {
            let key_repr = any_repr(key);
            serde_err!("{} is not serializable as a TOML key", key_repr)
        }
    }
}

let name = self.obj.get_type().name().map_err(debug_py_err)?;
match self.obj.repr() {
Ok(repr) => Err(ser::Error::custom(format_args!(
"{} is not serializable to TOML: {}",
name, repr,
))),
Err(_) => Err(ser::Error::custom(format_args!("{name} is not serializable to TOML"))),
}
/// Describes a Python object for error messages as `<repr> (<type name>)`.
///
/// Falls back to the bare type name when `repr()` fails, and to `"unknown"` when the type name
/// itself cannot be obtained.
fn any_repr(obj: &PyAny) -> String {
    let type_name = obj.get_type().name().unwrap_or("unknown");
    obj.repr()
        .map(|repr| format!("{repr} ({type_name})"))
        .unwrap_or_else(|_| type_name.to_string())
}

#[pyfunction]
fn serialize(py: Python, obj: PyObject) -> PyResult<String> {
let s = SerializePyObject {
py,
obj: obj.extract(py)?,
};
fn serialize(py: Python, obj: &PyAny) -> PyResult<String> {
let s = SerializePyObject::new(py, obj);
match to_toml_string(&s) {
Ok(s) => Ok(s),
Err(e) => Err(TomlSerializationError::new_err(e.to_string())),
}
}

#[pyfunction]
fn serialize_pretty(py: Python, obj: PyObject) -> PyResult<String> {
let s = SerializePyObject {
py,
obj: obj.extract(py)?,
};
fn serialize_pretty(py: Python, obj: &PyAny) -> PyResult<String> {
let s = SerializePyObject::new(py, obj);
match to_toml_string_pretty(&s) {
Ok(s) => Ok(s),
Err(e) => Err(TomlSerializationError::new_err(e.to_string())),
Expand Down
61 changes: 61 additions & 0 deletions src/py_type.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
use pyo3::once_cell::GILOnceCell;
use pyo3::prelude::*;
use pyo3::types::{PyByteArray, PyBytes, PyDate, PyDateTime, PyDelta, PyDict, PyList, PyString, PyTime, PyTuple};

/// Table of Python type-object addresses used to identify a value's type.
///
/// Every field stores a type-object pointer cast to `usize` (filled in by `PyTypeLookup::new`).
/// A value is classified by comparing its own type pointer with these fields; because this is
/// plain integer equality, only an exact pointer match compares equal.
#[derive(Clone)]
#[cfg_attr(debug_assertions, derive(Debug))]
pub struct PyTypeLookup {
// type of `None`
pub none: usize,
// numeric types
pub int: usize,
pub bool: usize,
pub float: usize,
// string types
pub string: usize,
pub bytes: usize,
pub bytearray: usize,
// sequence types
pub list: usize,
pub tuple: usize,
// mapping types
pub dict: usize,
// datetime types
pub datetime: usize,
pub date: usize,
pub time: usize,
pub timedelta: usize,
}

/// Shared `PyTypeLookup` instance; starts empty and is filled on first use by
/// `PyTypeLookup::cached`.
static TYPE_LOOKUP: GILOnceCell<PyTypeLookup> = GILOnceCell::new();

impl PyTypeLookup {
    /// Builds the table by creating one throwaway value of each supported Python type and
    /// recording the address of its type object.
    ///
    /// # Panics
    ///
    /// Panics if one of the sample `datetime`, `date`, `time` or `timedelta` objects cannot be
    /// constructed. The arguments are fixed, valid values, so a failure here indicates a broken
    /// interpreter environment rather than bad user input.
    fn new(py: Python) -> Self {
        Self {
            none: py.None().as_ref(py).get_type_ptr() as usize,
            // numeric types
            int: 0i32.into_py(py).as_ref(py).get_type_ptr() as usize,
            bool: true.into_py(py).as_ref(py).get_type_ptr() as usize,
            float: 0f32.into_py(py).as_ref(py).get_type_ptr() as usize,
            // string types
            string: PyString::new(py, "s").get_type_ptr() as usize,
            bytes: PyBytes::new(py, b"s").get_type_ptr() as usize,
            bytearray: PyByteArray::new(py, b"s").get_type_ptr() as usize,
            // sequence types
            list: PyList::empty(py).get_type_ptr() as usize,
            tuple: PyTuple::empty(py).get_type_ptr() as usize,
            // mapping types
            dict: PyDict::new(py).get_type_ptr() as usize,
            // datetime types
            datetime: PyDateTime::new(py, 2000, 1, 1, 0, 0, 0, 0, None)
                .expect("2000-01-01T00:00:00 is a valid datetime; construction should not fail")
                .get_type_ptr() as usize,
            date: PyDate::new(py, 2000, 1, 1)
                .expect("2000-01-01 is a valid date; construction should not fail")
                .get_type_ptr() as usize,
            time: PyTime::new(py, 0, 0, 0, 0, None)
                .expect("00:00:00 is a valid time; construction should not fail")
                .get_type_ptr() as usize,
            timedelta: PyDelta::new(py, 0, 0, 0, false)
                .expect("a zero timedelta is valid; construction should not fail")
                .get_type_ptr() as usize,
        }
    }

    /// Returns the shared lookup table, building it via [`PyTypeLookup::new`] on the first call.
    pub fn cached(py: Python<'_>) -> &PyTypeLookup {
        TYPE_LOOKUP.get_or_init(py, || PyTypeLookup::new(py))
    }
}
Loading