Skip to content

Commit

Permalink
Make Pydantic model serialization consistent regardless of surrogates.
Browse files Browse the repository at this point in the history
Without this code, Pydantic models containing surrogates get serialized differently than models that don't contain surrogates. This leads to a less smooth user experience in LangSmith for users whose data contains surrogates.

With this fix, Pydantic models and other tricky Python data types are always serialized in the same way, regardless of whether they contain surrogates or not.
  • Loading branch information
obi1kenobi committed Jan 10, 2025
1 parent c36cf67 commit bb68539
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 1 deletion.
2 changes: 1 addition & 1 deletion python/langsmith/_internal/_serde.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def dumps_json(obj: Any) -> bytes:
logger.debug(f"Orjson serialization failed: {repr(e)}. Falling back to json.")
result = json.dumps(
obj,
default=_simple_default,
default=_serialize_json,
ensure_ascii=True,
).encode("utf-8")
try:
Expand Down
33 changes: 33 additions & 0 deletions python/tests/integration_tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import datetime
import io
import json
import logging
import os
import random
Expand All @@ -19,6 +20,7 @@
from pydantic import BaseModel
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor

from langsmith._internal._serde import dumps_json
from langsmith.client import ID_TYPE, Client
from langsmith.evaluation import aevaluate, evaluate
from langsmith.schemas import (
Expand Down Expand Up @@ -1155,6 +1157,37 @@ def test_surrogates():
)


def test_fallback_json_serialization():
    """Check ``dumps_json`` output for surrogate-bearing strings and models.

    Each case pairs an input string with the text expected inside the
    serialized JSON. Most inputs spell an astral character as an escaped
    UTF-16 surrogate pair; the final case already holds the combined
    character. Every input is serialized twice: once as a bare string and
    once as the ``content`` field of a pydantic model. Both results must
    carry the same expected text.
    """

    class Document(BaseModel):
        content: str

    cases = [
        ("Hello\ud83d\ude00", "Hello😀"),
        ("Python\ud83d\udc0d", "Python🐍"),
        ("Surrogate\ud834\udd1e", "Surrogate𝄞"),
        ("Example\ud83c\udf89", "Example🎉"),
        ("String\ud83c\udfa7", "String🎧"),
        ("With\ud83c\udf08", "With🌈"),
        ("Surrogates\ud83d\ude0e", "Surrogates😎"),
        ("Embedded\ud83d\udcbb", "Embedded💻"),
        ("In\ud83c\udf0e", "In🌎"),
        ("The\ud83d\udcd6", "The📖"),
        ("Text\ud83d\udcac", "Text💬"),
        ("收花🙄·到", "收花🙄·到"),
    ]
    # Build every model before serializing anything, so a model-construction
    # failure surfaces ahead of the serialization assertions.
    documents = [Document(content=raw_text) for raw_text, _ in cases]

    # Bare strings serialize to a quoted JSON string.
    for raw_text, expected_text in cases:
        serialized = dumps_json(raw_text).decode("utf8")
        assert f'"{expected_text}"' == serialized

    # Models serialize to a compact JSON object with a single "content" key.
    for document, (_, expected_text) in zip(documents, cases):
        serialized = dumps_json(document).decode("utf8")
        assert f'{{"content":"{expected_text}"}}' == serialized


def test_runs_stats():
langchain_client = Client()
# We always have stuff in the "default" project...
Expand Down

0 comments on commit bb68539

Please sign in to comment.