Make Pydantic model serialization consistent regardless of surrogates.

Without this change, Pydantic models containing surrogates are serialized differently from models that do not contain them: when orjson serialization fails, `dumps_json` falls back to the standard `json` module, and that fallback used `_simple_default` instead of `_serialize_json` as its `default` handler. This leads to a less smooth experience in LangSmith for users whose data contains surrogates. With this fix, Pydantic models and other tricky Python data types are always serialized the same way, whether or not they contain surrogates.
langchain-ai · Jan 10, 2025 · bb68539
1 parent c36cf67
commit bb68539
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 1 deletion.
diff --git a/python/langsmith/_internal/_serde.py b/python/langsmith/_internal/_serde.py
@@ -146,7 +146,7 @@ def dumps_json(obj: Any) -> bytes:
         logger.debug(f"Orjson serialization failed: {repr(e)}. Falling back to json.")
         result = json.dumps(
             obj,
-            default=_simple_default,
+            default=_serialize_json,
             ensure_ascii=True,
         ).encode("utf-8")
         try:

diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py
@@ -2,6 +2,7 @@
 
 import datetime
 import io
+import json
 import logging
 import os
 import random
@@ -19,6 +20,7 @@
 from pydantic import BaseModel
 from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
 
+from langsmith._internal._serde import dumps_json
 from langsmith.client import ID_TYPE, Client
 from langsmith.evaluation import aevaluate, evaluate
 from langsmith.schemas import (
@@ -1155,6 +1157,37 @@ def test_surrogates():
     )
 
 
+def test_fallback_json_serialization():
+    class Document(BaseModel):
+        content: str
+
+    raw_surrogates = [
+        ("Hello\ud83d\ude00", "Hello😀"),
+        ("Python\ud83d\udc0d", "Python🐍"),
+        ("Surrogate\ud834\udd1e", "Surrogate𝄞"),
+        ("Example\ud83c\udf89", "Example🎉"),
+        ("String\ud83c\udfa7", "String🎧"),
+        ("With\ud83c\udf08", "With🌈"),
+        ("Surrogates\ud83d\ude0e", "Surrogates😎"),
+        ("Embedded\ud83d\udcbb", "Embedded💻"),
+        ("In\ud83c\udf0e", "In🌎"),
+        ("The\ud83d\udcd6", "The📖"),
+        ("Text\ud83d\udcac", "Text💬"),
+        ("收花🙄·到", "收花🙄·到"),
+    ]
+    pydantic_surrogates = [
+        (Document(content=item), expected) for item, expected in raw_surrogates
+    ]
+
+    for item, expected in raw_surrogates:
+        output = dumps_json(item).decode("utf8")
+        assert f'"{expected}"' == output
+
+    for item, expected in pydantic_surrogates:
+        output = dumps_json(item).decode("utf8")
+        assert f'{{"content":"{expected}"}}' == output
+
+
 def test_runs_stats():
     langchain_client = Client()
     # We always have stuff in the "default" project...