fix: working TList serialization (#763)

* fix: initialize empty `TObject` members on `to_TObjString`
* add test for serialization of `TObjString`
* remove unused dependency on test
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* add `tojson` method to `TObjString`
* add additional check to `TObjString` write test
* fix bad field in `TList` tojson conversion
* add inexpensive `assert` to `TList` serialization
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* fix bad serialization of non-empty TList due to options (#763 (comment))
* add tests for TList serialization
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* fixed bad `__repr__` for `TObject`
* add serialization of `fUniqueID` to `TObject`
* add `empty` method to `TObject`
* remove redundant `TObject` member initialization
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Update src/uproot/writing/identify.py
  Co-authored-by: Jim Pivarski <jpivarski@users.noreply.github.com>
* moved `TList` serialization list to `serialize` method
* add helper serialization method `bytestring` as suggested in #763 (comment) by @agoose77
* keep `TList` `_options` as python `bytes` and update serialization to use the new `bytestring` helper
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* reset `serialization.py` to `main` branch status
* Revert "keep `TList` `_options` as python `bytes` and update serialization to use the new `bytestring` helper"
  This reverts commit 8e6ad2e.
* Revert "[pre-commit.ci] auto fixes from pre-commit.com hooks"
  This reverts commit 897972f.
  # Conflicts:
  #   src/uproot/serialization.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Revert "reset `serialization.py` to `main` branch status"
  This reverts commit 9a500c1.
* Revert "Revert "keep `TList` `_options` as python `bytes` and update serialization to use the new `bytestring` helper""
  This reverts commit 884ec4d.
* Update src/uproot/models/TObject.py
  Co-authored-by: Angus Hollands <goosey15@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jim Pivarski <jpivarski@users.noreply.github.com>
Co-authored-by: Angus Hollands <goosey15@gmail.com>
scikit-hep · Oct 28, 2022 · cb8c776 · cb8c776
1 parent 33777b1
commit cb8c776
Show file tree

Hide file tree

Showing 7 changed files with 124 additions and 16 deletions.
diff --git a/src/uproot/models/TList.py b/src/uproot/models/TList.py
@@ -81,7 +81,7 @@ def tojson(self):
             "_typename": "TList",
             "name": "TList",
             "arr": [x.tojson() for x in self._data],
-            "opt": [],
+            "opt": ["" for _ in self._data],
         }
 
     writable = True
@@ -91,6 +91,10 @@ def _to_writable_postprocess(self, original):
         self._options = original._options
 
     def _serialize(self, out, header, name, tobject_flags):
+        assert (
+            self._members["fSize"] == len(self._data) == len(self._options)
+        ), "Fatal error in TList serialization."
+
         import uproot.writing._cascade
 
         where = len(out)
@@ -102,7 +106,7 @@ def _serialize(self, out, header, name, tobject_flags):
 
         for datum, option in zip(self._data, self._options):
             uproot.serialization._serialize_object_any(out, datum, None)
-            out.append(option)
+            out.append(uproot.serialization.bytestring(option))
 
         if header:
             num_bytes = sum(len(x) for x in out[where:])

diff --git a/src/uproot/models/TObjString.py b/src/uproot/models/TObjString.py
@@ -64,6 +64,12 @@ def fTitle(self):
 
     writable = True
 
+    def tojson(self):
+        out = self._bases[0].tojson()  # TObject
+        out["_typename"] = self.classname
+        out["fString"] = str(self)
+        return out
+
     def _serialize(self, out, header, name, tobject_flags):
         where = len(out)
         for x in self._bases:

diff --git a/src/uproot/models/TObject.py b/src/uproot/models/TObject.py
@@ -69,7 +69,10 @@ def read_members(self, chunk, cursor, context, file):
     writable = True
 
     def _serialize(self, out, header, name, tobject_flags):
-        out.append(b"\x00\x01" + _tobject_format2.pack(0, tobject_flags))
+        out.append(
+            b"\x00\x01"
+            + _tobject_format2.pack(self.member("@fUniqueID"), tobject_flags)
+        )
 
     @classmethod
     def strided_interpretation(
@@ -114,7 +117,7 @@ def awkward_form(cls, file, context):
 
     def __repr__(self):
         return "<TObject {} {} at 0x{:012x}>".format(
-            self._members.get("fUniqueID"), self._members.get("fBits"), id(self)
+            self.member("@fUniqueID"), self.member("@fBits"), id(self)
         )
 
     def tojson(self):
@@ -124,5 +127,12 @@ def tojson(self):
             "fBits": self.member("@fBits"),
         }
 
+    @classmethod
+    def empty(cls):
+        self = super().empty()
+        self._members["@fUniqueID"] = 0
+        self._members["@fBits"] = 0
+        return self
+
 
 uproot.classes["TObject"] = Model_TObject
diff --git a/src/uproot/serialization.py b/src/uproot/serialization.py
@@ -24,12 +24,22 @@ def string(data):
     is preceded by a 1-byte length; otherwise, it is preceded by ``b'\xff'`` and a
     4-byte length.
     """
-    bytestring = data.encode(errors="surrogateescape")
-    length = len(bytestring)
+    return bytestring(data.encode(errors="surrogateescape"))
+
+
+def bytestring(data):
+    """
+    Converts Python bytes into a length-prefixed bytestring, ready to be written to a file.
+
+    If the string's byte representation (UTF-8) has fewer than 255 bytes, it
+    is preceded by a 1-byte length; otherwise, it is preceded by ``b'\xff'`` and a
+    4-byte length.
+    """
+    length = len(data)
     if length < 255:
-        return struct.pack(">B%ds" % length, length, bytestring)
+        return struct.pack(">B%ds" % length, length, data)
     else:
-        return struct.pack(">BI%ds" % length, 255, length, bytestring)
+        return struct.pack(">BI%ds" % length, 255, length, data)
 
 
 def numbytes_version(num_bytes, version):

diff --git a/src/uproot/writing/identify.py b/src/uproot/writing/identify.py
@@ -735,12 +735,14 @@ def to_TObjString(string):
     This function is for developers to create TObjString objects that can be
     written to ROOT files, to implement conversion routines.
     """
+    tobject = uproot.models.TObject.Model_TObject.empty()
+
     tobjstring = uproot.models.TObjString.Model_TObjString(str(string))
     tobjstring._deeply_writable = True
     tobjstring._cursor = None
     tobjstring._parent = None
     tobjstring._members = {}
-    tobjstring._bases = (uproot.models.TObject.Model_TObject(),)
+    tobjstring._bases = (tobject,)
     tobjstring._num_bytes = len(string) + (1 if len(string) < 255 else 5) + 16
     tobjstring._instance_version = 1
     return tobjstring
@@ -761,8 +763,6 @@ def to_TList(data, name=""):
         )
 
     tobject = uproot.models.TObject.Model_TObject.empty()
-    tobject._members["@fUniqueID"] = 0
-    tobject._members["@fBits"] = 0
 
     tlist = uproot.models.TList.Model_TList.empty()
     tlist._bases.append(tobject)
@@ -874,8 +874,6 @@ def to_TAxis(
     written to ROOT files, to implement conversion routines.
     """
     tobject = uproot.models.TObject.Model_TObject.empty()
-    tobject._members["@fUniqueID"] = 0
-    tobject._members["@fBits"] = 0
 
     tnamed = uproot.models.TNamed.Model_TNamed.empty()
     tnamed._deeply_writable = True
@@ -1018,8 +1016,6 @@ def to_TH1x(
     TH1C, TH1D, TH1F, TH1I, or TH1S depends on the dtype of the ``data`` array.
     """
     tobject = uproot.models.TObject.Model_TObject.empty()
-    tobject._members["@fUniqueID"] = 0
-    tobject._members["@fBits"] = 0
 
     tnamed = uproot.models.TNamed.Model_TNamed.empty()
     tnamed._deeply_writable = True

diff --git a/tests/test_0349-write-TObjString.py b/tests/test_0349-write-TObjString.py
@@ -2,7 +2,6 @@
 
 import os
 
-import numpy as np
 import pytest
 
 import uproot
@@ -78,3 +77,22 @@ def test_update(tmp_path):
         assert f6["subdir/wowie"] == "wowie"
         assert f6["subdir/zowie"] == "zowie"
         assert list(f6.file.streamers) == ["TObjString"]
+
+
+def test_serialization(tmp_path):
+    filename = os.path.join(tmp_path, "whatever.root")
+
+    string = "hey"
+    tobjstring = uproot.writing.identify.to_TObjString(string)
+    assert (
+        tobjstring.tojson()["_typename"] == "TObjString"
+    )  # https://github.com/scikit-hep/uproot5/issues/762
+    assert tobjstring.tojson()["fString"] == str(tobjstring)
+
+    with uproot.recreate(filename) as f1:
+        f1["first"] = tobjstring
+        f1["second"] = str(tobjstring)  # also checks conversion to "str"
+
+    with uproot.open(filename) as f2:
+        assert f2["first"] == f2["second"]
+        assert str(f2["first"]) == string
diff --git a/tests/test_0351-write-TList.py b/tests/test_0351-write-TList.py
@@ -0,0 +1,64 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/uproot4/blob/main/LICENSE
+
+import os
+
+import pytest
+
+import uproot
+import uproot.writing
+
+
+def test_write_empty(tmp_path):
+    filename = os.path.join(tmp_path, "whatever.root")
+
+    tlist = uproot.writing.identify.to_TList([])
+
+    with uproot.recreate(filename) as f:
+        f["test"] = tlist
+
+    with uproot.open(filename) as f2:
+        assert len(f2["test"]) == 0
+
+
+def test_write_single_key(tmp_path):
+    filename = os.path.join(tmp_path, "whatever.root")
+
+    tlist = uproot.writing.identify.to_TList(
+        [uproot.writing.identify.to_TObjString("test string")]
+    )
+
+    with uproot.recreate(filename) as f:
+        f["test"] = tlist
+
+    with uproot.open(filename) as f2:
+        assert len(f2["test"]) == 1
+
+
+def test_write_nested(tmp_path):
+    filename = os.path.join(tmp_path, "whatever.root")
+
+    tlist_child = uproot.writing.identify.to_TList(
+        [uproot.writing.identify.to_TObjString(s) for s in "this is a test".split()]
+    )
+
+    entries = [
+        uproot.writing.identify.to_TObjString("this string goes in the front"),
+        tlist_child,
+        uproot.writing.identify.to_TObjString("test string"),
+    ]
+
+    tlist = uproot.writing.identify.to_TList(entries)
+
+    with uproot.recreate(filename) as f:
+        f["test"] = tlist
+
+    with uproot.open(filename) as f2:
+        parent_list = f2["test"]
+        assert len(parent_list) == 3
+        assert isinstance(parent_list[0], uproot.models.TObjString.Model_TObjString)
+        assert str(parent_list[0]) == "this string goes in the front"
+        assert str(parent_list[2]) == "test string"
+        child_list = parent_list[1]
+        assert isinstance(child_list, uproot.models.TList.Model_TList)
+        assert len(child_list) == 4
+        assert " ".join([str(s) for s in child_list]) == "this is a test"