From 858c17eec8228831a175d781e19be0855a775427 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Apr 2022 13:40:39 -0500 Subject: [PATCH 01/13] First, get rid of the 'io' submodule. --- src/awkward/_v2/__init__.py | 1 - src/awkward/_v2/operations/convert/__init__.py | 4 ++++ .../{io => convert}/ak_from_json_file.py | 0 .../operations/{io => convert}/ak_to_json_file.py | 0 src/awkward/_v2/operations/io/__init__.py | 4 ---- tests/v2/test_0019-use-json-library.py | 10 ++++++---- tests/v2/test_0437-stream-of-many-json-files.py | 14 ++++++++------ 7 files changed, 18 insertions(+), 15 deletions(-) rename src/awkward/_v2/operations/{io => convert}/ak_from_json_file.py (100%) rename src/awkward/_v2/operations/{io => convert}/ak_to_json_file.py (100%) delete mode 100644 src/awkward/_v2/operations/io/__init__.py diff --git a/src/awkward/_v2/__init__.py b/src/awkward/_v2/__init__.py index ad9ac6f84e..4e25d3139d 100644 --- a/src/awkward/_v2/__init__.py +++ b/src/awkward/_v2/__init__.py @@ -31,7 +31,6 @@ import awkward._v2.behaviors.string # noqa: F401 # operations -from awkward._v2.operations.io import * # noqa: F401, F403 from awkward._v2.operations.convert import * # noqa: F401, F403 from awkward._v2.operations.describe import * # noqa: F401, F403 from awkward._v2.operations.structure import * # noqa: F401, F403 diff --git a/src/awkward/_v2/operations/convert/__init__.py b/src/awkward/_v2/operations/convert/__init__.py index bf0eeaa0fa..cbb1a387cb 100644 --- a/src/awkward/_v2/operations/convert/__init__.py +++ b/src/awkward/_v2/operations/convert/__init__.py @@ -9,10 +9,14 @@ from awkward._v2.operations.convert.ak_from_iter import from_iter # noqa: F401 from awkward._v2.operations.convert.ak_to_list import to_list # noqa: F401 from awkward._v2.operations.convert.ak_from_json import from_json # noqa: F401 +from awkward._v2.operations.convert.ak_from_json_file import ( # noqa: F401 + from_json_file, +) from awkward._v2.operations.convert.ak_from_json_schema import ( # 
noqa: F401 from_json_schema, ) from awkward._v2.operations.convert.ak_to_json import to_json # noqa: F401 +from awkward._v2.operations.convert.ak_to_json_file import to_json_file # noqa: F401 from awkward._v2.operations.convert.ak_to_layout import to_layout # noqa: F401 from awkward._v2.operations.convert.ak_to_arrow import to_arrow # noqa: F401 from awkward._v2.operations.convert.ak_to_arrow_table import ( # noqa: F401 diff --git a/src/awkward/_v2/operations/io/ak_from_json_file.py b/src/awkward/_v2/operations/convert/ak_from_json_file.py similarity index 100% rename from src/awkward/_v2/operations/io/ak_from_json_file.py rename to src/awkward/_v2/operations/convert/ak_from_json_file.py diff --git a/src/awkward/_v2/operations/io/ak_to_json_file.py b/src/awkward/_v2/operations/convert/ak_to_json_file.py similarity index 100% rename from src/awkward/_v2/operations/io/ak_to_json_file.py rename to src/awkward/_v2/operations/convert/ak_to_json_file.py diff --git a/src/awkward/_v2/operations/io/__init__.py b/src/awkward/_v2/operations/io/__init__.py deleted file mode 100644 index 83a623c80c..0000000000 --- a/src/awkward/_v2/operations/io/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE - -from awkward._v2.operations.io.ak_from_json_file import from_json_file # noqa: F401 -from awkward._v2.operations.io.ak_to_json_file import to_json_file # noqa: F401 diff --git a/tests/v2/test_0019-use-json-library.py b/tests/v2/test_0019-use-json-library.py index 5ab0bbdff8..932c018f9c 100644 --- a/tests/v2/test_0019-use-json-library.py +++ b/tests/v2/test_0019-use-json-library.py @@ -32,19 +32,21 @@ def test_fromfile(tmp_path): with open(os.path.join(str(tmp_path), "tmp1.json"), "w") as f: f.write("[[1.1, 2.2, 3], [], [4, 5.5]]") - array = ak._v2.operations.io.from_json_file( + array = ak._v2.operations.convert.from_json_file( os.path.join(str(tmp_path), "tmp1.json") ) assert array.tolist() == [[1.1, 
2.2, 3.0], [], [4.0, 5.5]] with pytest.raises(IOError): - ak._v2.operations.io.from_json_file("nonexistent.json") + ak._v2.operations.convert.from_json_file("nonexistent.json") with open(os.path.join(str(tmp_path), "tmp2.json"), "w") as f: f.write("[[1.1, 2.2, 3], []], [4, 5.5]]") with pytest.raises(ValueError): - ak._v2.operations.io.from_json_file(os.path.join(str(tmp_path), "tmp2.json")) + ak._v2.operations.convert.from_json_file( + os.path.join(str(tmp_path), "tmp2.json") + ) def test_tostring(): @@ -146,7 +148,7 @@ def test_complex_with_nan_and_inf(): def test_tofile(tmp_path): - ak._v2.operations.io.to_json_file( + ak._v2.operations.convert.to_json_file( ak._v2.operations.convert.from_json("[[1.1,2.2,3],[],[4,5.5]]"), os.path.join(str(tmp_path), "tmp1.json"), ) diff --git a/tests/v2/test_0437-stream-of-many-json-files.py b/tests/v2/test_0437-stream-of-many-json-files.py index 2b663a3bc4..9ed28f57c2 100644 --- a/tests/v2/test_0437-stream-of-many-json-files.py +++ b/tests/v2/test_0437-stream-of-many-json-files.py @@ -132,7 +132,7 @@ def test_two_arrays(): array = ak._v2.operations.convert.from_json(str) assert array.tolist() == ["one", "two"] - array = ak._v2.operations.io.from_json_file( + array = ak._v2.operations.convert.from_json_file( os.path.join(path, "samples/test-two-arrays.json") ) assert array.tolist() == [ @@ -286,7 +286,7 @@ def test_array_tojson(): def test_fromfile(): # read multiple json fragments from a json file - array = ak._v2.operations.io.from_json_file( + array = ak._v2.operations.convert.from_json_file( os.path.join(path, "samples/test-record-array.json") ) assert array.tolist() == [ @@ -300,7 +300,7 @@ def test_fromfile(): # read json file containing 'nan' and 'inf' user-defined strings # and replace 'nan' and 'inf' strings with floats - array = ak._v2.operations.io.from_json_file( + array = ak._v2.operations.convert.from_json_file( os.path.join(path, "samples/test.json"), infinity_string="inf", minus_infinity_string="-inf", @@ -363,7 
+363,9 @@ def test_fromfile(): ] # read json file containing 'nan' and 'inf' user-defined strings - array = ak._v2.operations.io.from_json_file(os.path.join(path, "samples/test.json")) + array = ak._v2.operations.convert.from_json_file( + os.path.join(path, "samples/test.json") + ) assert array.tolist() == [ 1.1, @@ -423,7 +425,7 @@ def test_fromfile(): # read json file containing 'nan' and 'inf' user-defined strings # and replace 'nan' and 'inf' strings with a predefined 'None' string - array = ak._v2.operations.io.from_json_file( + array = ak._v2.operations.convert.from_json_file( os.path.join(path, "samples/test.json"), infinity_string="inf", minus_infinity_string="-inf", @@ -499,7 +501,7 @@ def fix(obj): # read json file containing multiple definitions of 'nan' and 'inf' # user-defined strings # replace can only work for one string definition - array = ak._v2.operations.io.from_json_file( + array = ak._v2.operations.convert.from_json_file( os.path.join(path, "samples/test-nan-inf.json"), infinity_string="Infinity", nan_string="None at all", From 69fb47da49bae70de0b93a945818204ecca9d3bf Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Apr 2022 15:32:49 -0500 Subject: [PATCH 02/13] Put all of the ._to_json functionality into ._to_list. 
--- src/awkward/_v2/contents/bitmaskedarray.py | 6 +- src/awkward/_v2/contents/bytemaskedarray.py | 8 +-- src/awkward/_v2/contents/content.py | 66 ++++++++++++++++++- src/awkward/_v2/contents/emptyarray.py | 2 +- src/awkward/_v2/contents/indexedarray.py | 6 +- .../_v2/contents/indexedoptionarray.py | 6 +- src/awkward/_v2/contents/listarray.py | 4 +- src/awkward/_v2/contents/listoffsetarray.py | 6 +- src/awkward/_v2/contents/numpyarray.py | 40 ++++++++++- src/awkward/_v2/contents/recordarray.py | 8 +-- src/awkward/_v2/contents/regulararray.py | 6 +- src/awkward/_v2/contents/unionarray.py | 6 +- src/awkward/_v2/contents/unmaskedarray.py | 6 +- src/awkward/nplike.py | 5 +- 14 files changed, 134 insertions(+), 41 deletions(-) diff --git a/src/awkward/_v2/contents/bitmaskedarray.py b/src/awkward/_v2/contents/bitmaskedarray.py index 35bf48a8cb..a2a3e51f08 100644 --- a/src/awkward/_v2/contents/bitmaskedarray.py +++ b/src/awkward/_v2/contents/bitmaskedarray.py @@ -628,13 +628,13 @@ def packed(self): self._nplike, ) - def _to_list(self, behavior): - out = self._to_list_custom(behavior) + def _to_list(self, behavior, json_conversions): + out = self._to_list_custom(behavior, json_conversions) if out is not None: return out mask = self.mask_as_bool(valid_when=True, nplike=self.nplike)[: self._length] - out = self._content._getitem_range(slice(0, self._length))._to_list(behavior) + out = self._content._getitem_range(slice(0, self._length))._to_list(behavior, json_conversions) for i, isvalid in enumerate(mask): if not isvalid: diff --git a/src/awkward/_v2/contents/bytemaskedarray.py b/src/awkward/_v2/contents/bytemaskedarray.py index 713c93dbb3..5c8fef8a5e 100644 --- a/src/awkward/_v2/contents/bytemaskedarray.py +++ b/src/awkward/_v2/contents/bytemaskedarray.py @@ -1009,13 +1009,13 @@ def packed(self): self._nplike, ) - def _to_list(self, behavior): - out = self._to_list_custom(behavior) + def _to_list(self, behavior, json_conversions): + out = self._to_list_custom(behavior, 
json_conversions) if out is not None: return out mask = self.mask_as_bool(valid_when=True, nplike=self.nplike) - out = self._content._getitem_range(slice(0, len(mask)))._to_list(behavior) + out = self._content._getitem_range(slice(0, len(mask)))._to_list(behavior, json_conversions) for i, isvalid in enumerate(mask): if not isvalid: @@ -1044,7 +1044,7 @@ def _to_json( complex_real_string, complex_imag_string, ): - out = self._to_list_custom(behavior) + out = self._to_list_custom(behavior, None) if out is not None: return out diff --git a/src/awkward/_v2/contents/content.py b/src/awkward/_v2/contents/content.py index c05d7d2404..384834dde8 100644 --- a/src/awkward/_v2/contents/content.py +++ b/src/awkward/_v2/contents/content.py @@ -1,5 +1,7 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE +import numbers +import math import copy from collections.abc import Iterable @@ -1333,15 +1335,75 @@ def tolist(self, behavior=None): return self.to_list(behavior) def to_list(self, behavior=None): - return self.packed()._to_list(behavior) + return self.packed()._to_list(behavior, None) - def _to_list_custom(self, behavior): + def _to_list_custom(self, behavior, json_conversions): cls = ak._v2._util.arrayclass(self, behavior) if cls.__getitem__ is not ak._v2.highlevel.Array.__getitem__: array = cls(self) out = [None] * self.length for i in range(self.length): out[i] = array[i] + + if json_conversions is not None: + outimag = None + complex_real_string = json_conversions["complex_real_string"] + complex_imag_string = json_conversions["complex_imag_string"] + if complex_real_string is not None: + Real = numbers.Real + Complex = numbers.Complex + if any(not isinstance(x, Real) and isinstance(x, Complex) for x in out): + outimag = [None] * len(out) + for i, x in enumerate(out): + out[i] = x.real + outimag[i] = x.imag + + filters = [] + + nan_string = json_conversions["nan_string"] + if nan_string is not None: + isnan = math.isnan + 
filters.append(lambda x: nan_string if isnan(x) else x) + + infinity_string = json_conversions["infinity_string"] + if infinity_string is not None: + inf = float("inf") + filters.append(lambda x: infinity_string if x == inf else x) + + minus_infinity_string = json_conversions["minus_infinity_string"] + if minus_infinity_string is not None: + minf = float("-inf") + filters.append(lambda x: minus_infinity_string if x == minf else x) + + if len(filters) == 1: + f0 = filters[0] + for i, x in enumerate(out): + out[i] = f0(x) + if outimag is not None: + for i, x in enumerate(outimag): + outimag[i] = f0(x) + elif len(filters) == 2: + f0 = filters[0] + f1 = filters[1] + for i, x in enumerate(out): + out[i] = f1(f0(x)) + if outimag is not None: + for i, x in enumerate(outimag): + outimag[i] = f1(f0(x)) + elif len(filters) == 3: + f0 = filters[0] + f1 = filters[1] + f2 = filters[2] + for i, x in enumerate(out): + out[i] = f2(f1(f0(x))) + if outimag is not None: + for i, x in enumerate(outimag): + outimag[i] = f2(f1(f0(x))) + + if outimag is not None: + for i, (real, imag) in enumerate(zip(out, outimag)): + out[i] = {complex_real_string: real, complex_imag_string: imag} + return out def flatten(self, axis=1, depth=0): diff --git a/src/awkward/_v2/contents/emptyarray.py b/src/awkward/_v2/contents/emptyarray.py index eba1550000..b9b2ee4962 100644 --- a/src/awkward/_v2/contents/emptyarray.py +++ b/src/awkward/_v2/contents/emptyarray.py @@ -334,7 +334,7 @@ def continuation(): def packed(self): return self - def _to_list(self, behavior): + def _to_list(self, behavior, json_conversions): return [] def _to_nplike(self, nplike): diff --git a/src/awkward/_v2/contents/indexedarray.py b/src/awkward/_v2/contents/indexedarray.py index fee6a0f0b1..059ea70b5c 100644 --- a/src/awkward/_v2/contents/indexedarray.py +++ b/src/awkward/_v2/contents/indexedarray.py @@ -1217,14 +1217,14 @@ def continuation(): def packed(self): return self.project().packed() - def _to_list(self, behavior): - out = 
self._to_list_custom(behavior) + def _to_list(self, behavior, json_conversions): + out = self._to_list_custom(behavior, json_conversions) if out is not None: return out index = self._index.raw(numpy) nextcontent = self._content._carry(ak._v2.index.Index(index), False) - return nextcontent._to_list(behavior) + return nextcontent._to_list(behavior, json_conversions) def _to_nplike(self, nplike): index = self._index._to_nplike(nplike) diff --git a/src/awkward/_v2/contents/indexedoptionarray.py b/src/awkward/_v2/contents/indexedoptionarray.py index 791ddf6fc8..86c3a1ae3b 100644 --- a/src/awkward/_v2/contents/indexedoptionarray.py +++ b/src/awkward/_v2/contents/indexedoptionarray.py @@ -1637,8 +1637,8 @@ def packed(self): self._nplike, ) - def _to_list(self, behavior): - out = self._to_list_custom(behavior) + def _to_list(self, behavior, json_conversions): + out = self._to_list_custom(behavior, json_conversions) if out is not None: return out @@ -1648,7 +1648,7 @@ def _to_list(self, behavior): nextcontent = self._content._carry( ak._v2.index.Index(index[not_missing]), False ) - out = nextcontent._to_list(behavior) + out = nextcontent._to_list(behavior, json_conversions) for i, isvalid in enumerate(not_missing): if not isvalid: diff --git a/src/awkward/_v2/contents/listarray.py b/src/awkward/_v2/contents/listarray.py index 6c8d90c319..21078b3844 100644 --- a/src/awkward/_v2/contents/listarray.py +++ b/src/awkward/_v2/contents/listarray.py @@ -1424,8 +1424,8 @@ def continuation(): def packed(self): return self.toListOffsetArray64(True).packed() - def _to_list(self, behavior): - return ListOffsetArray._to_list(self, behavior) + def _to_list(self, behavior, json_conversions): + return ListOffsetArray._to_list(self, behavior, json_conversions) def _to_nplike(self, nplike): starts = self._starts._to_nplike(nplike) diff --git a/src/awkward/_v2/contents/listoffsetarray.py b/src/awkward/_v2/contents/listoffsetarray.py index 427904ad9c..2727e9de95 100644 --- 
a/src/awkward/_v2/contents/listoffsetarray.py +++ b/src/awkward/_v2/contents/listoffsetarray.py @@ -2065,7 +2065,7 @@ def packed(self): next._offsets, content, next._identifier, next._parameters, self._nplike ) - def _to_list(self, behavior): + def _to_list(self, behavior, json_conversions): starts, stops = self.starts, self.stops starts_data = starts.raw(numpy) stops_data = stops.raw(numpy)[: len(starts_data)] @@ -2099,11 +2099,11 @@ def _to_list(self, behavior): return out else: - out = self._to_list_custom(behavior) + out = self._to_list_custom(behavior, json_conversions) if out is not None: return out - content = nextcontent._to_list(behavior) + content = nextcontent._to_list(behavior, json_conversions) out = [None] * starts.length for i in range(starts.length): diff --git a/src/awkward/_v2/contents/numpyarray.py b/src/awkward/_v2/contents/numpyarray.py index 43b1b823a7..d598a2134f 100644 --- a/src/awkward/_v2/contents/numpyarray.py +++ b/src/awkward/_v2/contents/numpyarray.py @@ -1295,7 +1295,7 @@ def continuation(): def packed(self): return self.contiguous().toRegularArray() - def _to_list(self, behavior): + def _to_list(self, behavior, json_conversions): if self.parameter("__array__") == "byte": return ak._v2._util.tobytes(self._data) @@ -1303,11 +1303,45 @@ def _to_list(self, behavior): return ak._v2._util.tobytes(self._data).decode(errors="surrogateescape") else: - out = self._to_list_custom(behavior) + out = self._to_list_custom(behavior, json_conversions) if out is not None: return out - return self._data.tolist() + if json_conversions is not None: + complex_real_string = json_conversions["complex_real_string"] + complex_imag_string = json_conversions["complex_imag_string"] + if complex_real_string is not None: + if issubclass(self.dtype.type, np.complexfloating): + return ak._v2.contents.RecordArray( + [ + ak._v2.contents.NumpyArray(self._data.real, nplike=self._nplike), + ak._v2.contents.NumpyArray(self._data.imag, nplike=self._nplike), + ], + 
[complex_real_string, complex_imag_string], + self.length, + parameters=self._parameters, + nplike=self._nplike, + )._to_list(behavior, json_conversions) + + out = self._data.tolist() + + if json_conversions is not None: + nan_string = json_conversions["nan_string"] + if nan_string is not None: + for i in self._nplike.nonzero(self._nplike.isnan(self._data))[0]: + out[i] = nan_string + + infinity_string = json_conversions["infinity_string"] + if infinity_string is not None: + for i in self._nplike.nonzero(self._data == np.inf)[0]: + out[i] = infinity_string + + minus_infinity_string = json_conversions["minus_infinity_string"] + if minus_infinity_string is not None: + for i in self._nplike.nonzero(self._data == -np.inf)[0]: + out[i] = minus_infinity_string + + return out def _to_nplike(self, nplike): return NumpyArray( diff --git a/src/awkward/_v2/contents/recordarray.py b/src/awkward/_v2/contents/recordarray.py index af0451491c..bbd9dba974 100644 --- a/src/awkward/_v2/contents/recordarray.py +++ b/src/awkward/_v2/contents/recordarray.py @@ -1007,13 +1007,13 @@ def packed(self): self._nplike, ) - def _to_list(self, behavior): - out = self._to_list_custom(behavior) + def _to_list(self, behavior, json_conversions): + out = self._to_list_custom(behavior, json_conversions) if out is not None: return out if self.is_tuple: - contents = [x._to_list(behavior) for x in self._contents] + contents = [x._to_list(behavior, json_conversions) for x in self._contents] length = self._length out = [None] * length for i in range(length): @@ -1022,7 +1022,7 @@ def _to_list(self, behavior): else: fields = self._fields - contents = [x._to_list(behavior) for x in self._contents] + contents = [x._to_list(behavior, json_conversions) for x in self._contents] length = self._length out = [None] * length for i in range(length): diff --git a/src/awkward/_v2/contents/regulararray.py b/src/awkward/_v2/contents/regulararray.py index 94ad099452..fa00a4f656 100644 --- 
a/src/awkward/_v2/contents/regulararray.py +++ b/src/awkward/_v2/contents/regulararray.py @@ -1212,7 +1212,7 @@ def packed(self): self._nplike, ) - def _to_list(self, behavior): + def _to_list(self, behavior, json_conversions): if self.parameter("__array__") == "bytestring": content = ak._v2._util.tobytes(self._content.data) length, size = self._length, self._size @@ -1232,11 +1232,11 @@ def _to_list(self, behavior): return out else: - out = self._to_list_custom(behavior) + out = self._to_list_custom(behavior, json_conversions) if out is not None: return out - content = self._content._to_list(behavior) + content = self._content._to_list(behavior, json_conversions) length, size = self._length, self._size out = [None] * length for i in range(length): diff --git a/src/awkward/_v2/contents/unionarray.py b/src/awkward/_v2/contents/unionarray.py index 6acbf103cd..b9922e236d 100644 --- a/src/awkward/_v2/contents/unionarray.py +++ b/src/awkward/_v2/contents/unionarray.py @@ -1390,14 +1390,14 @@ def packed(self): self._nplike, ) - def _to_list(self, behavior): - out = self._to_list_custom(behavior) + def _to_list(self, behavior, json_conversions): + out = self._to_list_custom(behavior, json_conversions) if out is not None: return out tags = self._tags.raw(numpy) index = self._index.raw(numpy) - contents = [x._to_list(behavior) for x in self._contents] + contents = [x._to_list(behavior, json_conversions) for x in self._contents] out = [None] * tags.shape[0] for i, tag in enumerate(tags): diff --git a/src/awkward/_v2/contents/unmaskedarray.py b/src/awkward/_v2/contents/unmaskedarray.py index ba03908e4b..4b1533f8b0 100644 --- a/src/awkward/_v2/contents/unmaskedarray.py +++ b/src/awkward/_v2/contents/unmaskedarray.py @@ -543,12 +543,12 @@ def packed(self): self._content.packed(), self._identifier, self._parameters, self._nplike ) - def _to_list(self, behavior): - out = self._to_list_custom(behavior) + def _to_list(self, behavior, json_conversions): + out = 
self._to_list_custom(behavior, json_conversions) if out is not None: return out - return self._content._to_list(behavior) + return self._content._to_list(behavior, json_conversions) def _to_nplike(self, nplike): content = self._content._to_nplike(nplike) diff --git a/src/awkward/nplike.py b/src/awkward/nplike.py index dd6501dab3..b1294de932 100644 --- a/src/awkward/nplike.py +++ b/src/awkward/nplike.py @@ -77,6 +77,7 @@ class NumpyMetadata(Singleton): signedinteger = numpy.signedinteger unsignedinteger = numpy.unsignedinteger floating = numpy.floating + complexfloating = numpy.complexfloating number = numpy.number object_ = numpy.object_ generic = numpy.generic @@ -114,10 +115,6 @@ class NumpyMetadata(Singleton): if hasattr(numpy, "timedelta64"): NumpyMetadata.timedelta64 = numpy.timedelta64 -NumpyMetadata.all_complex = tuple( - getattr(numpy, x) for x in dir(NumpyMetadata) if x.startswith("complex") -) - class NumpyLike(Singleton): known_data = True From 30403ad86730ff292fda9b1d97f310e6fa9e71a7 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Apr 2022 16:29:12 -0500 Subject: [PATCH 03/13] Tests pass again. 
--- src/awkward/_v2/contents/content.py | 87 ++++++++++--------- src/awkward/_v2/contents/listoffsetarray.py | 21 +++-- src/awkward/_v2/contents/numpyarray.py | 6 +- src/awkward/_v2/contents/recordarray.py | 4 +- src/awkward/_v2/contents/regulararray.py | 21 +++-- .../_v2/operations/convert/ak_to_json.py | 41 +++++---- .../_v2/operations/convert/ak_to_json_file.py | 51 +++++------ tests/v2/test_0019-use-json-library.py | 2 +- tests/v2/test_0032-replace-dressedtype.py | 2 +- .../v2/test_0652-tests-of-complex-numbers.py | 4 +- 10 files changed, 127 insertions(+), 112 deletions(-) diff --git a/src/awkward/_v2/contents/content.py b/src/awkward/_v2/contents/content.py index 384834dde8..faa8cba32a 100644 --- a/src/awkward/_v2/contents/content.py +++ b/src/awkward/_v2/contents/content.py @@ -1331,6 +1331,38 @@ def recursively_apply( }, ) + def to_json( + self, + nan_string=None, + infinity_string=None, + minus_infinity_string=None, + complex_record_fields=None, + convert_bytes=None, + behavior=None, + ): + if complex_record_fields is None: + complex_real_string = None + complex_imag_string = None + elif ( + isinstance(complex_record_fields, tuple) + and len(complex_record_fields) == 2 + and isinstance(complex_record_fields[0], str) + and isinstance(complex_record_fields[1], str) + ): + complex_real_string, complex_imag_string = complex_record_fields + + return self.packed()._to_list( + behavior, + { + "nan_string": nan_string, + "infinity_string": infinity_string, + "minus_infinity_string": minus_infinity_string, + "complex_real_string": complex_real_string, + "complex_imag_string": complex_imag_string, + "convert_bytes": convert_bytes, + }, + ) + def tolist(self, behavior=None): return self.to_list(behavior) @@ -1346,6 +1378,12 @@ def _to_list_custom(self, behavior, json_conversions): out[i] = array[i] if json_conversions is not None: + convert_bytes = json_conversions["convert_bytes"] + if convert_bytes is not None: + for i, x in enumerate(out): + if isinstance(x, 
bytes): + out[i] = convert_bytes(x) + outimag = None complex_real_string = json_conversions["complex_real_string"] complex_imag_string = json_conversions["complex_imag_string"] @@ -1355,8 +1393,12 @@ def _to_list_custom(self, behavior, json_conversions): if any(not isinstance(x, Real) and isinstance(x, Complex) for x in out): outimag = [None] * len(out) for i, x in enumerate(out): - out[i] = x.real - outimag[i] = x.imag + if isinstance(x, Complex): + out[i] = x.real + outimag[i] = x.imag + else: + out[i] = x + outimag[i] = None filters = [] @@ -1416,47 +1458,6 @@ def to_backend(self, backend): else: return self._to_nplike(ak._v2._util.regularize_backend(backend)) - def _to_json_custom(self): - cls = ak._v2._util.arrayclass(self, None) - if cls.__getitem__ is not ak._v2.highlevel.Array.__getitem__: - array = cls(self) - out = [None] * self.length - for i in range(self.length): - out[i] = array[i] - return out - - def tojson( - self, - nan_string=None, - infinity_string=None, - minus_infinity_string=None, - complex_real_string=None, - complex_imag_string=None, - ): - return self.to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - - def to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - return self.packed()._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - def withparameter(self, key, value): out = copy.copy(self) diff --git a/src/awkward/_v2/contents/listoffsetarray.py b/src/awkward/_v2/contents/listoffsetarray.py index 2727e9de95..4843f5dd91 100644 --- a/src/awkward/_v2/contents/listoffsetarray.py +++ b/src/awkward/_v2/contents/listoffsetarray.py @@ -2083,19 +2083,28 @@ def _to_list(self, behavior, json_conversions): nextcontent = self._content._getitem_range(slice(mini, maxi)) if self.parameter("__array__") == "bytestring": + convert_bytes = None if 
json_conversions is None else json_conversions["convert_bytes"] content = ak._v2._util.tobytes(nextcontent.data) out = [None] * starts.length - for i in range(starts.length): - out[i] = content[starts_data[i] : stops_data[i]] + if convert_bytes is None: + for i in range(starts.length): + out[i] = content[starts_data[i] : stops_data[i]] + else: + for i in range(starts.length): + out[i] = convert_bytes(content[starts_data[i] : stops_data[i]]) return out elif self.parameter("__array__") == "string": - content = ak._v2._util.tobytes(nextcontent.data) + data = nextcontent.data + if hasattr(data, "tobytes"): + def tostring(x): + return x.tobytes().decode(errors="surrogateescape") + else: + def tostring(x): + return x.tostring().decode(errors="surrogateescape") out = [None] * starts.length for i in range(starts.length): - out[i] = content[starts_data[i] : stops_data[i]].decode( - errors="surrogateescape" - ) + out[i] = tostring(data[starts_data[i] : stops_data[i]]) return out else: diff --git a/src/awkward/_v2/contents/numpyarray.py b/src/awkward/_v2/contents/numpyarray.py index d598a2134f..76142cbd10 100644 --- a/src/awkward/_v2/contents/numpyarray.py +++ b/src/awkward/_v2/contents/numpyarray.py @@ -1297,7 +1297,11 @@ def packed(self): def _to_list(self, behavior, json_conversions): if self.parameter("__array__") == "byte": - return ak._v2._util.tobytes(self._data) + convert_bytes = None if json_conversions is None else json_conversions["convert_bytes"] + if convert_bytes is None: + return ak._v2._util.tobytes(self._data) + else: + return convert_bytes(ak._v2._util.tobytes(self._data)) elif self.parameter("__array__") == "char": return ak._v2._util.tobytes(self._data).decode(errors="surrogateescape") diff --git a/src/awkward/_v2/contents/recordarray.py b/src/awkward/_v2/contents/recordarray.py index bbd9dba974..0b19629363 100644 --- a/src/awkward/_v2/contents/recordarray.py +++ b/src/awkward/_v2/contents/recordarray.py @@ -1012,7 +1012,7 @@ def _to_list(self, behavior, 
json_conversions): if out is not None: return out - if self.is_tuple: + if self.is_tuple and json_conversions is None: contents = [x._to_list(behavior, json_conversions) for x in self._contents] length = self._length out = [None] * length @@ -1022,6 +1022,8 @@ def _to_list(self, behavior, json_conversions): else: fields = self._fields + if fields is None: + fields = [str(i) for i in range(len(self._contents))] contents = [x._to_list(behavior, json_conversions) for x in self._contents] length = self._length out = [None] * length diff --git a/src/awkward/_v2/contents/regulararray.py b/src/awkward/_v2/contents/regulararray.py index fa00a4f656..6db5a07f35 100644 --- a/src/awkward/_v2/contents/regulararray.py +++ b/src/awkward/_v2/contents/regulararray.py @@ -1214,21 +1214,30 @@ def packed(self): def _to_list(self, behavior, json_conversions): if self.parameter("__array__") == "bytestring": + convert_bytes = None if json_conversions is None else json_conversions["convert_bytes"] content = ak._v2._util.tobytes(self._content.data) length, size = self._length, self._size out = [None] * length - for i in range(length): - out[i] = content[(i) * size : (i + 1) * size] + if convert_bytes is None: + for i in range(length): + out[i] = content[(i) * size : (i + 1) * size] + else: + for i in range(length): + out[i] = convert_bytes(content[(i) * size : (i + 1) * size]) return out elif self.parameter("__array__") == "string": - content = ak._v2._util.tobytes(self._content.data) + data = self._content.data + if hasattr(data, "tobytes"): + def tostring(x): + return x.tobytes().decode(errors="surrogateescape") + else: + def tostring(x): + return x.tostring().decode(errors="surrogateescape") length, size = self._length, self._size out = [None] * length for i in range(length): - out[i] = content[(i) * size : (i + 1) * size].decode( - errors="surrogateescape" - ) + out[i] = tostring(data[(i) * size : (i + 1) * size]) return out else: diff --git 
a/src/awkward/_v2/operations/convert/ak_to_json.py b/src/awkward/_v2/operations/convert/ak_to_json.py index 7bf56149db..e7e8161bb7 100644 --- a/src/awkward/_v2/operations/convert/ak_to_json.py +++ b/src/awkward/_v2/operations/convert/ak_to_json.py @@ -18,6 +18,7 @@ def to_json( infinity_string=None, minus_infinity_string=None, complex_record_fields=None, + convert_bytes=None, ): """ Args: @@ -35,6 +36,9 @@ def to_json( number. complex_record_fields (None or (str, str)): If not None, defines a pair of field names to interpret records as complex numbers. + convert_bytes (None or function): If not None, this function is applied to + all Python 3 bytes objects to produce something JSON serializable, + such as a string using UTF-8 or Base-64 encoding, lists of integers, etc. Converts `array` (many types supported, including all Awkward Arrays and Records) into a JSON string. @@ -53,7 +57,7 @@ def to_json( * #ak.types.RecordArray with field names: converted into JSON objects. * #ak.types.UnionArray: JSON data are naturally heterogeneous. - See also #ak.from_json and #ak.Array.tojson. + See also #ak.from_json. 
""" with ak._v2._util.OperationErrorContext( "ak._v2.to_json", @@ -66,6 +70,7 @@ def to_json( infinity_string=infinity_string, minus_infinity_string=minus_infinity_string, complex_record_fields=complex_record_fields, + convert_bytes=convert_bytes, ), ): return _impl( @@ -77,6 +82,7 @@ def to_json( infinity_string, minus_infinity_string, complex_record_fields, + convert_bytes, ) @@ -89,6 +95,7 @@ def _impl( infinity_string, minus_infinity_string, complex_record_fields, + convert_bytes, ): if array is None or isinstance(array, (bool, str, bytes, Number)): return json.dumps(array) @@ -128,24 +135,16 @@ def _impl( else: raise ak._v2._util.error(TypeError(f"unrecognized array type: {repr(array)}")) - if complex_record_fields is None: - complex_real_string = None - complex_imag_string = None - elif ( - isinstance(complex_record_fields, tuple) - and len(complex_record_fields) == 2 - and isinstance(complex_record_fields[0], str) - and isinstance(complex_record_fields[1], str) - ): - complex_real_string, complex_imag_string = complex_record_fields - - return json.dumps( - out.tojson( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ), - separators=(",", ":"), + jsondata = out.to_json( + nan_string=nan_string, + infinity_string=infinity_string, + minus_infinity_string=minus_infinity_string, + complex_record_fields=complex_record_fields, + convert_bytes=convert_bytes, + behavior=ak._v2._util.behavior_of(array), ) + + try: + return json.dumps(jsondata, separators=(",", ":")) + except Exception as err: + raise ak._v2._util.error(err) diff --git a/src/awkward/_v2/operations/convert/ak_to_json_file.py b/src/awkward/_v2/operations/convert/ak_to_json_file.py index eb6f6d8ce1..fd636eb31a 100644 --- a/src/awkward/_v2/operations/convert/ak_to_json_file.py +++ b/src/awkward/_v2/operations/convert/ak_to_json_file.py @@ -16,7 +16,7 @@ def to_json_file( infinity_string=None, minus_infinity_string=None, complex_record_fields=None, - 
buffersize=65536, + convert_bytes=None, ): """ Args: @@ -36,8 +36,9 @@ def to_json_file( number. complex_record_fields (None or (str, str)): If not None, defines a pair of field names to interpret records as complex numbers. - buffersize (int): Size (in bytes) of the buffer used by the JSON - parser. + convert_bytes (None or function): If not None, this function is applied to + all Python 3 bytes objects to produce something JSON serializable, + such as a string using UTF-8 or Base-64 encoding, lists of integers, etc. Converts `array` (many types supported, including all Awkward Arrays and Records) into a JSON file. @@ -56,7 +57,7 @@ def to_json_file( * #ak.types.RecordArray with field names: converted into JSON objects. * #ak.types.UnionArray: JSON data are naturally heterogeneous. - See also #ak.from_json and #ak.Array.tojson. + See also #ak.from_json. """ with ak._v2._util.OperationErrorContext( "ak._v2.to_json_file", @@ -69,7 +70,7 @@ def to_json_file( infinity_string=infinity_string, minus_infinity_string=minus_infinity_string, complex_record_fields=complex_record_fields, - buffersize=buffersize, + convert_bytes=convert_bytes, ), ): return _impl( @@ -81,7 +82,7 @@ def to_json_file( infinity_string, minus_infinity_string, complex_record_fields, - buffersize, + convert_bytes, ) @@ -94,7 +95,7 @@ def _impl( infinity_string, minus_infinity_string, complex_record_fields, - buffersize, + convert_bytes, ): if array is None or isinstance(array, (bool, str, bytes, Number)): return json.dump(array) @@ -134,26 +135,16 @@ def _impl( else: raise ak._v2._util.error(TypeError(f"unrecognized array type: {repr(array)}")) - if complex_record_fields is None: - complex_real_string = None - complex_imag_string = None - elif ( - isinstance(complex_record_fields, tuple) - and len(complex_record_fields) == 2 - and isinstance(complex_record_fields[0], str) - and isinstance(complex_record_fields[1], str) - ): - complex_real_string, complex_imag_string = complex_record_fields - - with 
open(destination, "w", encoding="utf-8") as f: - for chunk in json.dumps( - out.tojson( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ), - separators=(",", ":"), - ): - f.write(chunk) + with open(destination, "w", encoding="utf-8") as file: + jsondata = out.to_json( + nan_string=nan_string, + infinity_string=infinity_string, + minus_infinity_string=minus_infinity_string, + complex_record_fields=complex_record_fields, + convert_bytes=convert_bytes, + behavior=ak._v2._util.behavior_of(array), + ) + try: + json.dump(jsondata, file, separators=(",", ":")) + except Exception as err: + raise ak._v2._util.error(err) diff --git a/tests/v2/test_0019-use-json-library.py b/tests/v2/test_0019-use-json-library.py index 932c018f9c..5ac0ea944d 100644 --- a/tests/v2/test_0019-use-json-library.py +++ b/tests/v2/test_0019-use-json-library.py @@ -85,7 +85,7 @@ def test_bytearray(): array = ak._v2.contents.NumpyArray( np.frombuffer(b"hellothere", "u1"), parameters={"__array__": "byte"} ) - assert ak._v2.operations.convert.to_json(array) == '"hellothere"' + assert ak._v2.operations.convert.to_json(array, convert_bytes=bytes.decode) == '"hellothere"' def test_complex(): diff --git a/tests/v2/test_0032-replace-dressedtype.py b/tests/v2/test_0032-replace-dressedtype.py index 22737194e8..e261a4c17e 100644 --- a/tests/v2/test_0032-replace-dressedtype.py +++ b/tests/v2/test_0032-replace-dressedtype.py @@ -124,7 +124,7 @@ def test_builder_string(): a = builder.snapshot() assert str(a) == "[b'one', b'two', b'three']" assert to_list(a) == [b"one", b"two", b"three"] - assert ak._v2.operations.convert.to_json(a) == '["one","two","three"]' + assert ak._v2.operations.convert.to_json(a, convert_bytes=bytes.decode) == '["one","two","three"]' # assert repr(a) == "" assert str(ak._v2.operations.describe.type(a)) == "3 * bytes" diff --git a/tests/v2/test_0652-tests-of-complex-numbers.py b/tests/v2/test_0652-tests-of-complex-numbers.py index 
402edda981..8672808d22 100644 --- a/tests/v2/test_0652-tests-of-complex-numbers.py +++ b/tests/v2/test_0652-tests-of-complex-numbers.py @@ -108,11 +108,11 @@ def test_to_json(): # Complex numbers can't be converted to JSON without setting 'complex_record_fields', # but the error messages should refer to that name now. (I changed the name at # high-level, but not in the error messages emitted by C++ code.) - with pytest.raises(ValueError) as err: + with pytest.raises(TypeError) as err: ak._v2.operations.convert.to_json( ak._v2.operations.convert.from_iter([1 + 1j, 2 + 2j, 3 + 3j]) ) - assert "needs both" not in str(err) + assert "type complex is not JSON serializable" in str(err) expectation = [{"r": 1.0, "i": 1.0}, {"r": 2.0, "i": 2.0}, {"r": 3.0, "i": 3.0}] assert expectation == json.loads( From f70703adf55b83c7f4902c9c67a86fd9670c4c66 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Apr 2022 16:32:53 -0500 Subject: [PATCH 04/13] Remove old '_to_json' methods, which are now dead code. 
--- src/awkward/_v2/contents/bitmaskedarray.py | 26 ------ src/awkward/_v2/contents/bytemaskedarray.py | 28 ------- src/awkward/_v2/contents/emptyarray.py | 10 --- src/awkward/_v2/contents/indexedarray.py | 25 ------ .../_v2/contents/indexedoptionarray.py | 26 ------ src/awkward/_v2/contents/listarray.py | 17 ---- src/awkward/_v2/contents/listoffsetarray.py | 38 --------- src/awkward/_v2/contents/numpyarray.py | 79 ------------------- src/awkward/_v2/contents/recordarray.py | 58 -------------- src/awkward/_v2/contents/regulararray.py | 39 --------- src/awkward/_v2/contents/unionarray.py | 30 ------- src/awkward/_v2/contents/unmaskedarray.py | 20 ----- 12 files changed, 396 deletions(-) diff --git a/src/awkward/_v2/contents/bitmaskedarray.py b/src/awkward/_v2/contents/bitmaskedarray.py index a2a3e51f08..0ee3ff760e 100644 --- a/src/awkward/_v2/contents/bitmaskedarray.py +++ b/src/awkward/_v2/contents/bitmaskedarray.py @@ -655,29 +655,3 @@ def _to_nplike(self, nplike): parameters=self._parameters, nplike=nplike, ) - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - out = self._to_json_custom() - if out is not None: - return out - - mask = self.mask_as_bool(valid_when=True, nplike=self.nplike)[: self._length] - content = self._content._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - out = [None] * self._length - for i, isvalid in enumerate(mask): - if isvalid: - out[i] = content[i] - return out diff --git a/src/awkward/_v2/contents/bytemaskedarray.py b/src/awkward/_v2/contents/bytemaskedarray.py index 5c8fef8a5e..c88ce6e2fe 100644 --- a/src/awkward/_v2/contents/bytemaskedarray.py +++ b/src/awkward/_v2/contents/bytemaskedarray.py @@ -1034,31 +1034,3 @@ def _to_nplike(self, nplike): parameters=self._parameters, nplike=nplike, ) - - def _to_json( - self, - behavior, - nan_string, - infinity_string, - 
minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - out = self._to_list_custom(behavior, None) - if out is not None: - return out - - mask = self.mask_as_bool(valid_when=True, nplike=self.nplike) - content = self._content._to_json( - behavior, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - out = [None] * self._mask.length - for i, isvalid in enumerate(mask): - if isvalid: - out[i] = content[i] - return out diff --git a/src/awkward/_v2/contents/emptyarray.py b/src/awkward/_v2/contents/emptyarray.py index b9b2ee4962..7a576820bd 100644 --- a/src/awkward/_v2/contents/emptyarray.py +++ b/src/awkward/_v2/contents/emptyarray.py @@ -339,13 +339,3 @@ def _to_list(self, behavior, json_conversions): def _to_nplike(self, nplike): return EmptyArray(self._identifier, self._parameters, nplike=nplike) - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - return [] diff --git a/src/awkward/_v2/contents/indexedarray.py b/src/awkward/_v2/contents/indexedarray.py index 059ea70b5c..d5166f7422 100644 --- a/src/awkward/_v2/contents/indexedarray.py +++ b/src/awkward/_v2/contents/indexedarray.py @@ -1236,28 +1236,3 @@ def _to_nplike(self, nplike): parameters=self.parameters, nplike=nplike, ) - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - out = self._to_json_custom() - if out is not None: - return out - - index = self._index.raw(numpy) - content = self._content._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - out = [None] * index.length - for i, ind in enumerate(index): - out[i] = content[ind] - return out diff --git a/src/awkward/_v2/contents/indexedoptionarray.py b/src/awkward/_v2/contents/indexedoptionarray.py index 86c3a1ae3b..b0d6dda222 100644 --- 
a/src/awkward/_v2/contents/indexedoptionarray.py +++ b/src/awkward/_v2/contents/indexedoptionarray.py @@ -1662,29 +1662,3 @@ def _to_nplike(self, nplike): return IndexedOptionArray( index, content, self.identifier, self.parameters, nplike=nplike ) - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - out = self._to_json_custom() - if out is not None: - return out - - index = self._index.raw(numpy) - content = self._content._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - out = [None] * len(index) - for i, ind in enumerate(index): - if ind >= 0: - out[i] = content[ind] - return out diff --git a/src/awkward/_v2/contents/listarray.py b/src/awkward/_v2/contents/listarray.py index 21078b3844..b3e5a97af4 100644 --- a/src/awkward/_v2/contents/listarray.py +++ b/src/awkward/_v2/contents/listarray.py @@ -1439,20 +1439,3 @@ def _to_nplike(self, nplike): parameters=self._parameters, nplike=nplike, ) - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - return ListOffsetArray._to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) diff --git a/src/awkward/_v2/contents/listoffsetarray.py b/src/awkward/_v2/contents/listoffsetarray.py index 4843f5dd91..cacc63f386 100644 --- a/src/awkward/_v2/contents/listoffsetarray.py +++ b/src/awkward/_v2/contents/listoffsetarray.py @@ -2130,44 +2130,6 @@ def _to_nplike(self, nplike): nplike=nplike, ) - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - if ( - self.parameter("__array__") == "bytestring" - or self.parameter("__array__") == "string" - ): - content = ak._v2._util.tobytes(self._content.data) - starts, stops = self.starts, self.stops - out = [None] * 
starts.length - for i in range(starts.length): - out[i] = content[starts[i] : stops[i]].decode(errors="surrogateescape") - return out - - else: - out = self._to_json_custom() - if out is not None: - return out - - content = self._content._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - starts, stops = self.starts, self.stops - out = [None] * starts.length - - for i in range(starts.length): - out[i] = content[starts[i] : stops[i]] - return out - def _awkward_strings_to_nonfinite(self, nonfinit_dict): if self.parameter("__array__") == "string": strings = self.to_list() diff --git a/src/awkward/_v2/contents/numpyarray.py b/src/awkward/_v2/contents/numpyarray.py index 76142cbd10..2ec0064481 100644 --- a/src/awkward/_v2/contents/numpyarray.py +++ b/src/awkward/_v2/contents/numpyarray.py @@ -1355,85 +1355,6 @@ def _to_nplike(self, nplike): nplike=nplike, ) - def _to_json_custom( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - cls = ak._v2._util.arrayclass(self, None) - if cls.__getitem__ is not ak._v2.highlevel.Array.__getitem__: - array = cls(self) - out = [None] * self.length - for i in range(self.length): - out[i] = array[i] - return out - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - if ( - self.parameter("__array__") == "byte" - or self.parameter("__array__") == "char" - ): - return ak._v2._util.tobytes(self._data).decode(errors="surrogateescape") - - else: - if self.dtype == np.complex128: - if complex_real_string is None or complex_imag_string is None: - raise ak._v2._util.error( - ValueError( - "Complex numbers can't be converted to JSON without" - " setting 'complex_record_fields' " - ) - ) - - return ak._v2.operations.structure.zip( - { - complex_real_string: ak._v2.contents.NumpyArray( - self._data.real - ), - complex_imag_string: 
ak._v2.contents.NumpyArray( - self._data.imag - ), - } - ).layout._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - - if ( - nan_string is not None - or infinity_string is not None - or minus_infinity_string is not None - ): - out = self._nonfinite_to_union( - nan_string, infinity_string, minus_infinity_string - ) - return out.tolist() - - out = self._to_json_custom( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - if out is not None: - return out - - return self._data.tolist() - def __deepcopy__(self, memo=None): return ak._v2.contents.NumpyArray( copy.deepcopy(self._data), diff --git a/src/awkward/_v2/contents/recordarray.py b/src/awkward/_v2/contents/recordarray.py index 0b19629363..66b73ae088 100644 --- a/src/awkward/_v2/contents/recordarray.py +++ b/src/awkward/_v2/contents/recordarray.py @@ -1041,61 +1041,3 @@ def _to_nplike(self, nplike): parameters=self._parameters, nplike=nplike, ) - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - out = self._to_json_custom() - if out is not None: - return out - - cls = ak._v2._util.recordclass(self, None) - if cls is not ak._v2.highlevel.Record: - length = self._length - out = [None] * length - for i in range(length): - out[i] = cls(self[i]) - return out - - if self.is_tuple: - contents = [ - x._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - for x in self._contents - ] - length = self._length - out = [None] * length - fields = [] - for i in range(length): - fields.append(str(i)) - for i in range(length): - out[i] = dict(zip(fields, [x[i] for x in contents])) - return out - - else: - fields = self._fields - contents = [ - x._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - 
) - for x in self._contents - ] - length = self._length - out = [None] * length - for i in range(length): - out[i] = dict(zip(fields, [x[i] for x in contents])) - return out diff --git a/src/awkward/_v2/contents/regulararray.py b/src/awkward/_v2/contents/regulararray.py index 6db5a07f35..b6883ad41b 100644 --- a/src/awkward/_v2/contents/regulararray.py +++ b/src/awkward/_v2/contents/regulararray.py @@ -1262,42 +1262,3 @@ def _to_nplike(self, nplike): parameters=self.parameters, nplike=nplike, ) - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - if ( - self.parameter("__array__") == "bytestring" - or self.parameter("__array__") == "string" - ): - content = ak._v2._util.tobytes(self._content.data) - length, size = self._length, self._size - out = [None] * length - for i in range(length): - out[i] = content[(i) * size : (i + 1) * size].decode( - errors="surrogateescape" - ) - return out - - else: - out = self._to_json_custom() - if out is not None: - return out - - content = self._content._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - length, size = self._length, self._size - out = [None] * length - for i in range(length): - out[i] = content[(i) * size : (i + 1) * size] - return out diff --git a/src/awkward/_v2/contents/unionarray.py b/src/awkward/_v2/contents/unionarray.py index b9922e236d..11059ded5e 100644 --- a/src/awkward/_v2/contents/unionarray.py +++ b/src/awkward/_v2/contents/unionarray.py @@ -1415,33 +1415,3 @@ def _to_nplike(self, nplike): parameters=self.parameters, nplike=nplike, ) - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - out = self._to_json_custom() - if out is not None: - return out - - tags = self._tags.raw(numpy) - index = self._index.raw(numpy) - contents = [ - x._to_json( - nan_string, - 
infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) - for x in self._contents - ] - - out = [None] * tags.shape[0] - for i, tag in enumerate(tags): - out[i] = contents[tag][index[i]] - return out diff --git a/src/awkward/_v2/contents/unmaskedarray.py b/src/awkward/_v2/contents/unmaskedarray.py index 4b1533f8b0..4d979220c6 100644 --- a/src/awkward/_v2/contents/unmaskedarray.py +++ b/src/awkward/_v2/contents/unmaskedarray.py @@ -558,23 +558,3 @@ def _to_nplike(self, nplike): parameters=self.parameters, nplike=nplike, ) - - def _to_json( - self, - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ): - out = self._to_json_custom() - if out is not None: - return out - - return self._content._to_json( - nan_string, - infinity_string, - minus_infinity_string, - complex_real_string, - complex_imag_string, - ) From 8432a10c557f50350d851a7c9ee12ac032ee8ae4 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Apr 2022 16:34:35 -0500 Subject: [PATCH 05/13] Black formating. 
--- src/awkward/_v2/contents/bitmaskedarray.py | 4 +++- src/awkward/_v2/contents/bytemaskedarray.py | 4 +++- src/awkward/_v2/contents/content.py | 4 +++- src/awkward/_v2/contents/listoffsetarray.py | 8 +++++++- src/awkward/_v2/contents/numpyarray.py | 12 +++++++++--- src/awkward/_v2/contents/regulararray.py | 8 +++++++- tests/v2/test_0019-use-json-library.py | 5 ++++- tests/v2/test_0032-replace-dressedtype.py | 5 ++++- 8 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/awkward/_v2/contents/bitmaskedarray.py b/src/awkward/_v2/contents/bitmaskedarray.py index 0ee3ff760e..f2514c2f22 100644 --- a/src/awkward/_v2/contents/bitmaskedarray.py +++ b/src/awkward/_v2/contents/bitmaskedarray.py @@ -634,7 +634,9 @@ def _to_list(self, behavior, json_conversions): return out mask = self.mask_as_bool(valid_when=True, nplike=self.nplike)[: self._length] - out = self._content._getitem_range(slice(0, self._length))._to_list(behavior, json_conversions) + out = self._content._getitem_range(slice(0, self._length))._to_list( + behavior, json_conversions + ) for i, isvalid in enumerate(mask): if not isvalid: diff --git a/src/awkward/_v2/contents/bytemaskedarray.py b/src/awkward/_v2/contents/bytemaskedarray.py index c88ce6e2fe..f76760a3e8 100644 --- a/src/awkward/_v2/contents/bytemaskedarray.py +++ b/src/awkward/_v2/contents/bytemaskedarray.py @@ -1015,7 +1015,9 @@ def _to_list(self, behavior, json_conversions): return out mask = self.mask_as_bool(valid_when=True, nplike=self.nplike) - out = self._content._getitem_range(slice(0, len(mask)))._to_list(behavior, json_conversions) + out = self._content._getitem_range(slice(0, len(mask)))._to_list( + behavior, json_conversions + ) for i, isvalid in enumerate(mask): if not isvalid: diff --git a/src/awkward/_v2/contents/content.py b/src/awkward/_v2/contents/content.py index faa8cba32a..c4feef4068 100644 --- a/src/awkward/_v2/contents/content.py +++ b/src/awkward/_v2/contents/content.py @@ -1390,7 +1390,9 @@ def 
_to_list_custom(self, behavior, json_conversions): if complex_real_string is not None: Real = numbers.Real Complex = numbers.Complex - if any(not isinstance(x, Real) and isinstance(x, Complex) for x in out): + if any( + not isinstance(x, Real) and isinstance(x, Complex) for x in out + ): outimag = [None] * len(out) for i, x in enumerate(out): if isinstance(x, Complex): diff --git a/src/awkward/_v2/contents/listoffsetarray.py b/src/awkward/_v2/contents/listoffsetarray.py index cacc63f386..d11940b460 100644 --- a/src/awkward/_v2/contents/listoffsetarray.py +++ b/src/awkward/_v2/contents/listoffsetarray.py @@ -2083,7 +2083,9 @@ def _to_list(self, behavior, json_conversions): nextcontent = self._content._getitem_range(slice(mini, maxi)) if self.parameter("__array__") == "bytestring": - convert_bytes = None if json_conversions is None else json_conversions["convert_bytes"] + convert_bytes = ( + None if json_conversions is None else json_conversions["convert_bytes"] + ) content = ak._v2._util.tobytes(nextcontent.data) out = [None] * starts.length if convert_bytes is None: @@ -2097,11 +2099,15 @@ def _to_list(self, behavior, json_conversions): elif self.parameter("__array__") == "string": data = nextcontent.data if hasattr(data, "tobytes"): + def tostring(x): return x.tobytes().decode(errors="surrogateescape") + else: + def tostring(x): return x.tostring().decode(errors="surrogateescape") + out = [None] * starts.length for i in range(starts.length): out[i] = tostring(data[starts_data[i] : stops_data[i]]) diff --git a/src/awkward/_v2/contents/numpyarray.py b/src/awkward/_v2/contents/numpyarray.py index 2ec0064481..22122796f1 100644 --- a/src/awkward/_v2/contents/numpyarray.py +++ b/src/awkward/_v2/contents/numpyarray.py @@ -1297,7 +1297,9 @@ def packed(self): def _to_list(self, behavior, json_conversions): if self.parameter("__array__") == "byte": - convert_bytes = None if json_conversions is None else json_conversions["convert_bytes"] + convert_bytes = ( + None if 
json_conversions is None else json_conversions["convert_bytes"] + ) if convert_bytes is None: return ak._v2._util.tobytes(self._data) else: @@ -1318,8 +1320,12 @@ def _to_list(self, behavior, json_conversions): if issubclass(self.dtype.type, np.complexfloating): return ak._v2.contents.RecordArray( [ - ak._v2.contents.NumpyArray(self._data.real, nplike=self._nplike), - ak._v2.contents.NumpyArray(self._data.imag, nplike=self._nplike), + ak._v2.contents.NumpyArray( + self._data.real, nplike=self._nplike + ), + ak._v2.contents.NumpyArray( + self._data.imag, nplike=self._nplike + ), ], [complex_real_string, complex_imag_string], self.length, diff --git a/src/awkward/_v2/contents/regulararray.py b/src/awkward/_v2/contents/regulararray.py index b6883ad41b..28d435fa95 100644 --- a/src/awkward/_v2/contents/regulararray.py +++ b/src/awkward/_v2/contents/regulararray.py @@ -1214,7 +1214,9 @@ def packed(self): def _to_list(self, behavior, json_conversions): if self.parameter("__array__") == "bytestring": - convert_bytes = None if json_conversions is None else json_conversions["convert_bytes"] + convert_bytes = ( + None if json_conversions is None else json_conversions["convert_bytes"] + ) content = ak._v2._util.tobytes(self._content.data) length, size = self._length, self._size out = [None] * length @@ -1229,11 +1231,15 @@ def _to_list(self, behavior, json_conversions): elif self.parameter("__array__") == "string": data = self._content.data if hasattr(data, "tobytes"): + def tostring(x): return x.tobytes().decode(errors="surrogateescape") + else: + def tostring(x): return x.tostring().decode(errors="surrogateescape") + length, size = self._length, self._size out = [None] * length for i in range(length): diff --git a/tests/v2/test_0019-use-json-library.py b/tests/v2/test_0019-use-json-library.py index 5ac0ea944d..8e7c839975 100644 --- a/tests/v2/test_0019-use-json-library.py +++ b/tests/v2/test_0019-use-json-library.py @@ -85,7 +85,10 @@ def test_bytearray(): array = 
ak._v2.contents.NumpyArray( np.frombuffer(b"hellothere", "u1"), parameters={"__array__": "byte"} ) - assert ak._v2.operations.convert.to_json(array, convert_bytes=bytes.decode) == '"hellothere"' + assert ( + ak._v2.operations.convert.to_json(array, convert_bytes=bytes.decode) + == '"hellothere"' + ) def test_complex(): diff --git a/tests/v2/test_0032-replace-dressedtype.py b/tests/v2/test_0032-replace-dressedtype.py index e261a4c17e..9bf15394a8 100644 --- a/tests/v2/test_0032-replace-dressedtype.py +++ b/tests/v2/test_0032-replace-dressedtype.py @@ -124,7 +124,10 @@ def test_builder_string(): a = builder.snapshot() assert str(a) == "[b'one', b'two', b'three']" assert to_list(a) == [b"one", b"two", b"three"] - assert ak._v2.operations.convert.to_json(a, convert_bytes=bytes.decode) == '["one","two","three"]' + assert ( + ak._v2.operations.convert.to_json(a, convert_bytes=bytes.decode) + == '["one","two","three"]' + ) # assert repr(a) == "" assert str(ak._v2.operations.describe.type(a)) == "3 * bytes" From 2d0b91e1b6ef6e613f4639910c4bba67659648bf Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Apr 2022 16:49:34 -0500 Subject: [PATCH 06/13] Easier error string match. 
--- tests/v2/test_0652-tests-of-complex-numbers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v2/test_0652-tests-of-complex-numbers.py b/tests/v2/test_0652-tests-of-complex-numbers.py index 8672808d22..2cdbad4d66 100644 --- a/tests/v2/test_0652-tests-of-complex-numbers.py +++ b/tests/v2/test_0652-tests-of-complex-numbers.py @@ -112,7 +112,7 @@ def test_to_json(): ak._v2.operations.convert.to_json( ak._v2.operations.convert.from_iter([1 + 1j, 2 + 2j, 3 + 3j]) ) - assert "type complex is not JSON serializable" in str(err) + assert "not JSON serializable" in str(err) expectation = [{"r": 1.0, "i": 1.0}, {"r": 2.0, "i": 2.0}, {"r": 3.0, "i": 3.0}] assert expectation == json.loads( From 88da1e190e463fa51d799a0019453b9679484bc4 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Apr 2022 18:26:01 -0500 Subject: [PATCH 07/13] ak._v2.to_json now takes ak._v2.to_json_file's role, dropping the now-unnecessary function. --- .../_v2/operations/convert/__init__.py | 1 - .../_v2/operations/convert/ak_to_json.py | 205 +++++++++++++++--- .../_v2/operations/convert/ak_to_json_file.py | 150 ------------- src/awkward/_v2/record.py | 2 +- tests/v2/test_0019-use-json-library.py | 12 +- 5 files changed, 184 insertions(+), 186 deletions(-) delete mode 100644 src/awkward/_v2/operations/convert/ak_to_json_file.py diff --git a/src/awkward/_v2/operations/convert/__init__.py b/src/awkward/_v2/operations/convert/__init__.py index cbb1a387cb..c08b3d0daa 100644 --- a/src/awkward/_v2/operations/convert/__init__.py +++ b/src/awkward/_v2/operations/convert/__init__.py @@ -16,7 +16,6 @@ from_json_schema, ) from awkward._v2.operations.convert.ak_to_json import to_json # noqa: F401 -from awkward._v2.operations.convert.ak_to_json_file import to_json_file # noqa: F401 from awkward._v2.operations.convert.ak_to_layout import to_layout # noqa: F401 from awkward._v2.operations.convert.ak_to_arrow import to_arrow # noqa: F401 from 
awkward._v2.operations.convert.ak_to_arrow_table import ( # noqa: F401 diff --git a/src/awkward/_v2/operations/convert/ak_to_json.py b/src/awkward/_v2/operations/convert/ak_to_json.py index e7e8161bb7..256f02d1c1 100644 --- a/src/awkward/_v2/operations/convert/ak_to_json.py +++ b/src/awkward/_v2/operations/convert/ak_to_json.py @@ -1,32 +1,48 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE -import awkward as ak import json +from urllib.parse import urlparse from numbers import Number -np = ak.nplike.NumpyMetadata.instance() +import awkward as ak -# FIXME: 'pretty', 'verbose', and 'maxdecimals' are not used yet +np = ak.nplike.NumpyMetadata.instance() def to_json( array, - pretty=False, - verbose=False, - maxdecimals=6, + file=None, + line_delimited=False, + num_indent_spaces=None, + num_readability_spaces=0, nan_string=None, infinity_string=None, minus_infinity_string=None, complex_record_fields=None, convert_bytes=None, + convert_other=None, ): """ Args: array: Data to convert to JSON. - pretty (bool): If True, indent the output for human readability; if - False, output compact JSON without spaces. - maxdecimals (None or int): If an int, limit the number of - floating-point decimals to this number; if None, write all digits. + file (None, str, or file-like object): If None, this function returns a + JSON-encoded string. Otherwise, this function has no return value. + If a string, this function opens a file with that name, writes JSON + data, and closes the file. If that string has a URI protocol (like + "https://" or "s3://"), this function attempts to open the file with + the fsspec library. If a file-like object with a `write` method, + this function writes to the object, but does not close it. + line_delimited (bool or str): If False, a single JSON document is written, + representing the entire array or record. 
If True, each element of the + array (or just the one record) is written on a separate line of text, + separated by `"\n"`. If a string, such as `"\r\n"`, it is taken as a + custom line delimiter. (Use `os.linesep` for a platform-dependent + line delimiter.) + num_indent_spaces (None or nonnegative int): Number of spaces to indent nested + elements, for pretty-printed JSON. If None, the JSON output is written + on one line of text. Ignored if `line_delimited` is True or a string. + num_readability_spaces (nonnegative int): Number of spaces to include after + commas (`,`) and colons (`:`), for pretty-printed JSON. nan_string (None or str): If not None, floating-point NaN values will be replaced with this string instead of a JSON number. infinity_string (None or str): If not None, floating-point positive infinity @@ -38,11 +54,27 @@ def to_json( field names to interpret records as complex numbers. convert_bytes (None or function): If not None, this function is applied to all Python 3 bytes objects to produce something JSON serializable, - such as a string using UTF-8 or Base-64 encoding, lists of integers, etc. + such as a string using UTF-8 or Base64 encoding, lists of integers, etc. + convert_other (None or function): Passed to `json.dump` or `json.dumps` + as `default` to convert any other objects that #ak.to_list would return + but are not JSON serializable. Converts `array` (many types supported, including all Awkward Arrays and Records) into a JSON string. + This function converts the array into Python objects with #ak.to_list, performs + some conversions to make the data JSON serializable (`nan_string`, `infinity_string`, + `minus_infinity_string`, `complex_record_fields`, `convert_bytes`, `convert_other`), + then uses `json.dumps` to return a string or `json.dump` to write to a file + (depending on the value of `file`). 
+ + If `line_delimited` is True or a line-delimiter string like `"\r\n"`/`os.linesep`, + the output is line-delimited JSON, variously referred to as "ldjson", "ndjson", and + "jsonl". (Use an appropriate file extension!) + + To pretty-print the JSON, set `num_indent_spaces=4, num_readability_spaces=1` (for + example). + Awkward Array types have the following JSON translations. * #ak.types.PrimitiveType: converted into JSON booleans and numbers. @@ -50,52 +82,69 @@ def to_json( * #ak.types.ListType: converted into JSON lists. * #ak.types.RegularType: also converted into JSON lists. JSON (and Python) forms lose information about the regularity of list lengths. - * #ak.types.ListType with parameter `"__array__"` equal to - `"__bytestring__"` or `"__string__"`: converted into JSON strings. + * #ak.types.ListType or #ak.types.RegularType with parameter `"__array__"` + equal to `"string"`: converted into JSON strings. * #ak.types.RecordArray without field names: converted into JSON objects with numbers as strings for keys. * #ak.types.RecordArray with field names: converted into JSON objects. * #ak.types.UnionArray: JSON data are naturally heterogeneous. + If the array contains any NaN (not a number), infinite values, or + imaginary/complex types, `nan_string` or `infinity_string` _must_ be supplied. + + If the array contains any raw bytestrings (`"__array__"` equal to `"bytestring"`), + `convert_bytes` _must_ be supplied. To interpret as strings, use `bytes.decode`. + To Base64-encode, use `lambda x: base64.b64encode(x).decode()`. + + Other non-serializable types are only possible through custom behaviors that + override `__getitem__` (which might return arbitrary Python objects). Use + `convert_other` to detect these types and convert them. + See also #ak.from_json. 
""" with ak._v2._util.OperationErrorContext( "ak._v2.to_json", dict( array=array, - pretty=pretty, - verbose=verbose, - maxdecimals=maxdecimals, + file=file, + line_delimited=line_delimited, + num_indent_spaces=num_indent_spaces, + num_readability_spaces=num_readability_spaces, nan_string=nan_string, infinity_string=infinity_string, minus_infinity_string=minus_infinity_string, complex_record_fields=complex_record_fields, convert_bytes=convert_bytes, + convert_other=convert_other, ), ): return _impl( array, - pretty, - verbose, - maxdecimals, + file, + line_delimited, + num_indent_spaces, + num_readability_spaces, nan_string, infinity_string, minus_infinity_string, complex_record_fields, convert_bytes, + convert_other, ) def _impl( array, - pretty, - verbose, - maxdecimals, + file, + line_delimited, + num_indent_spaces, + num_readability_spaces, nan_string, infinity_string, minus_infinity_string, complex_record_fields, convert_bytes, + convert_other, ): if array is None or isinstance(array, (bool, str, bytes, Number)): return json.dumps(array) @@ -110,13 +159,13 @@ def _impl( out = array.layout elif isinstance(array, ak._v2.highlevel.Record): - out = array.layout + out = array.layout.array[array.layout.at : array.layout.at + 1] elif isinstance(array, ak._v2.highlevel.ArrayBuilder): out = array.snapshot().layout elif isinstance(array, ak._v2.record.Record): - out = array + out = array.array[array.at : array.at + 1] elif isinstance(array, ak.layout.ArrayBuilder): formstr, length, buffers = array.to_buffers() @@ -125,9 +174,6 @@ def _impl( out = ak._v2.operations.convert.from_buffers( form, length, buffers, highlevel=False ) - # FIXME: the code is a copy from snapshot, - # because this call returns v1: - # out = array.snapshot() elif isinstance(array, ak._v2.contents.Content): out = array @@ -144,7 +190,110 @@ def _impl( behavior=ak._v2._util.behavior_of(array), ) + if line_delimited and not ak._v2._util.isstr(line_delimited): + line_delimited = "\n" + + separators = ( 
+ "," + " " * num_readability_spaces, + ":" + " " * num_readability_spaces, + ) + + if file is not None: + if ak._v2._util.isstr(file): + parsed_url = urlparse(file) + if parsed_url.scheme == "" or parsed_url.netloc == "": + + def opener(): + return open(file, "w", encoding="utf8") + + else: + import fsspec + + def opener(): + return fsspec.open(file, "w", encoding="utf8") + + else: + + def opener(): + return _NoContextManager(file) + try: - return json.dumps(jsondata, separators=(",", ":")) + if line_delimited: + if file is None: + out = [] + for datum in jsondata: + out.append( + json.dumps( + datum, + skipkeys=True, + ensure_ascii=True, + check_circular=False, + allow_nan=False, + indent=None, + separators=separators, + default=convert_other, + sort_keys=False, + ) + ) + return line_delimited.join(out) + + else: + with opener() as file: + json.dump( + datum, + file, + skipkeys=True, + ensure_ascii=True, + check_circular=False, + allow_nan=False, + indent=None, + separators=separators, + default=convert_other, + sort_keys=False, + ) + file.write(line_delimited) + + else: + if isinstance(array, (ak._v2.highlevel.Record, ak._v2.record.Record)): + jsondata = jsondata[0] + + if file is None: + return json.dumps( + jsondata, + skipkeys=True, + ensure_ascii=True, + check_circular=False, + allow_nan=False, + indent=num_indent_spaces, + separators=separators, + default=convert_other, + sort_keys=False, + ) + else: + with opener() as file: + return json.dump( + jsondata, + file, + skipkeys=True, + ensure_ascii=True, + check_circular=False, + allow_nan=False, + indent=num_indent_spaces, + separators=separators, + default=convert_other, + sort_keys=False, + ) + except Exception as err: raise ak._v2._util.error(err) + + +class _NoContextManager: + def __init__(self, file): + self.file = file + + def __enter__(self): + return self.file + + def __exit__(self, exception_type, exception_value, exception_traceback): + pass diff --git 
a/src/awkward/_v2/operations/convert/ak_to_json_file.py b/src/awkward/_v2/operations/convert/ak_to_json_file.py deleted file mode 100644 index fd636eb31a..0000000000 --- a/src/awkward/_v2/operations/convert/ak_to_json_file.py +++ /dev/null @@ -1,150 +0,0 @@ -# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE - -import awkward as ak -import json -from numbers import Number - -np = ak.nplike.NumpyMetadata.instance() - - -def to_json_file( - array, - destination=None, - pretty=False, - maxdecimals=None, - nan_string=None, - infinity_string=None, - minus_infinity_string=None, - complex_record_fields=None, - convert_bytes=None, -): - """ - Args: - array: Data to convert to JSON. - destination (str): a file name to write to (overwrite) that - file (returning None). - pretty (bool): If True, indent the output for human readability; if - False, output compact JSON without spaces. - maxdecimals (None or int): If an int, limit the number of - floating-point decimals to this number; if None, write all digits. - nan_string (None or str): If not None, floating-point NaN values will be - replaced with this string instead of a JSON number. - infinity_string (None or str): If not None, floating-point positive infinity - values will be replaced with this string instead of a JSON number. - minus_infinity_string (None or str): If not None, floating-point negative - infinity values will be replaced with this string instead of a JSON - number. - complex_record_fields (None or (str, str)): If not None, defines a pair of - field names to interpret records as complex numbers. - convert_bytes (None or function): If not None, this function is applied to - all Python 3 bytes objects to produce something JSON serializable, - such as a string using UTF-8 or Base-64 encoding, lists of integers, etc. - - Converts `array` (many types supported, including all Awkward Arrays and - Records) into a JSON file. 
- - Awkward Array types have the following JSON translations. - - * #ak.types.PrimitiveType: converted into JSON booleans and numbers. - * #ak.types.OptionType: missing values are converted into None. - * #ak.types.ListType: converted into JSON lists. - * #ak.types.RegularType: also converted into JSON lists. JSON (and - Python) forms lose information about the regularity of list lengths. - * #ak.types.ListType with parameter `"__array__"` equal to - `"__bytestring__"` or `"__string__"`: converted into JSON strings. - * #ak.types.RecordArray without field names: converted into JSON - objects with numbers as strings for keys. - * #ak.types.RecordArray with field names: converted into JSON objects. - * #ak.types.UnionArray: JSON data are naturally heterogeneous. - - See also #ak.from_json. - """ - with ak._v2._util.OperationErrorContext( - "ak._v2.to_json_file", - dict( - array=array, - destination=destination, - pretty=pretty, - maxdecimals=maxdecimals, - nan_string=nan_string, - infinity_string=infinity_string, - minus_infinity_string=minus_infinity_string, - complex_record_fields=complex_record_fields, - convert_bytes=convert_bytes, - ), - ): - return _impl( - array, - destination, - pretty, - maxdecimals, - nan_string, - infinity_string, - minus_infinity_string, - complex_record_fields, - convert_bytes, - ) - - -def _impl( - array, - destination, - pretty, - maxdecimals, - nan_string, - infinity_string, - minus_infinity_string, - complex_record_fields, - convert_bytes, -): - if array is None or isinstance(array, (bool, str, bytes, Number)): - return json.dump(array) - - elif isinstance(array, bytes): - return json.dump(array.decode("utf-8", "surrogateescape")) - - elif isinstance(array, np.ndarray): - out = ak._v2.contents.NumpyArray(array) - - elif isinstance(array, ak._v2.highlevel.Array): - out = array.layout - - elif isinstance(array, ak._v2.highlevel.Record): - out = array.layout - - elif isinstance(array, ak._v2.highlevel.ArrayBuilder): - out = 
array.snapshot().layout - - elif isinstance(array, ak._v2.record.Record): - out = array - - elif isinstance(array, ak.layout.ArrayBuilder): - formstr, length, buffers = array.to_buffers() - form = ak._v2.forms.from_json(formstr) - - out = ak._v2.operations.convert.from_buffers( - form, length, buffers, highlevel=False - ) - # FIXME: the code is a copy from snapshot, - # because this call returns v1: - # out = array.snapshot() - - elif isinstance(array, ak._v2.contents.Content): - out = array - - else: - raise ak._v2._util.error(TypeError(f"unrecognized array type: {repr(array)}")) - - with open(destination, "w", encoding="utf-8") as file: - jsondata = out.to_json( - nan_string=nan_string, - infinity_string=infinity_string, - minus_infinity_string=minus_infinity_string, - complex_record_fields=complex_record_fields, - convert_bytes=convert_bytes, - behavior=ak._v2._util.behavior_of(array), - ) - try: - json.dump(jsondata, file, separators=(",", ":")) - except Exception as err: - raise ak._v2._util.error(err) diff --git a/src/awkward/_v2/record.py b/src/awkward/_v2/record.py index f172dfd654..85a3634f63 100644 --- a/src/awkward/_v2/record.py +++ b/src/awkward/_v2/record.py @@ -189,7 +189,7 @@ def to_list(self, behavior=None): if cls is not ak._v2.highlevel.Record: return cls(self) - return self._array[self._at : self._at + 1].to_list(behavior)[0] + return self._array[self._at : self._at + 1]._to_list(behavior, None)[0] def deep_copy(self): return Record(self._array.deep_copy(), copy.deepcopy(self._at)) diff --git a/tests/v2/test_0019-use-json-library.py b/tests/v2/test_0019-use-json-library.py index 8e7c839975..c985801ae9 100644 --- a/tests/v2/test_0019-use-json-library.py +++ b/tests/v2/test_0019-use-json-library.py @@ -151,9 +151,9 @@ def test_complex_with_nan_and_inf(): def test_tofile(tmp_path): - ak._v2.operations.convert.to_json_file( + ak._v2.operations.convert.to_json( ak._v2.operations.convert.from_json("[[1.1,2.2,3],[],[4,5.5]]"), - 
os.path.join(str(tmp_path), "tmp1.json"), + file=os.path.join(str(tmp_path), "tmp1.json"), ) with open(os.path.join(str(tmp_path), "tmp1.json")) as f: @@ -258,8 +258,8 @@ def test_numpy(): ) ) assert ( - ak._v2.operations.convert.to_json(b3) - == "[[[1.1,2.2,3.3],[4.4,5.5,6.6]],[[10.1,20.2,Infinity],[40.4,50.5,60.6]]]" + ak._v2.operations.convert.to_json(b3, infinity_string="Infinity") + == '[[[1.1,2.2,3.3],[4.4,5.5,6.6]],[[10.1,20.2,"Infinity"],[40.4,50.5,60.6]]]' ) b4 = ak._v2.contents.NumpyArray( np.array( @@ -270,8 +270,8 @@ def test_numpy(): ) ) assert ( - ak._v2.operations.convert.to_json(b4) - == "[[[1.1,2.2,3.3],[4.4,5.5,6.6]],[[10.1,20.2,-Infinity],[40.4,50.5,60.6]]]" + ak._v2.operations.convert.to_json(b4, minus_infinity_string="-Infinity") + == '[[[1.1,2.2,3.3],[4.4,5.5,6.6]],[[10.1,20.2,"-Infinity"],[40.4,50.5,60.6]]]' ) c = ak._v2.contents.NumpyArray( np.array([[True, False, True], [False, False, True]]) From 44db2b004736537aacf7bf34246c30d3a9f0f784 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Apr 2022 18:54:49 -0500 Subject: [PATCH 08/13] Test all of the 'ak._v2.to_json' options. --- .../_v2/operations/convert/ak_to_json.py | 33 +-- ...est_1449-v2-to_json-from_json-functions.py | 237 ++++++++++++++++++ 2 files changed, 255 insertions(+), 15 deletions(-) create mode 100644 tests/v2/test_1449-v2-to_json-from_json-functions.py diff --git a/src/awkward/_v2/operations/convert/ak_to_json.py b/src/awkward/_v2/operations/convert/ak_to_json.py index 256f02d1c1..5e61b08d8a 100644 --- a/src/awkward/_v2/operations/convert/ak_to_json.py +++ b/src/awkward/_v2/operations/convert/ak_to_json.py @@ -51,7 +51,8 @@ def to_json( infinity values will be replaced with this string instead of a JSON number. complex_record_fields (None or (str, str)): If not None, defines a pair of - field names to interpret records as complex numbers. + field names to interpret records as complex numbers, such as + `("real", "imag")`. 
convert_bytes (None or function): If not None, this function is applied to all Python 3 bytes objects to produce something JSON serializable, such as a string using UTF-8 or Base64 encoding, lists of integers, etc. @@ -235,23 +236,25 @@ def opener(): sort_keys=False, ) ) - return line_delimited.join(out) + out.append(line_delimited) + return "".join(out) else: with opener() as file: - json.dump( - datum, - file, - skipkeys=True, - ensure_ascii=True, - check_circular=False, - allow_nan=False, - indent=None, - separators=separators, - default=convert_other, - sort_keys=False, - ) - file.write(line_delimited) + for datum in jsondata: + json.dump( + datum, + file, + skipkeys=True, + ensure_ascii=True, + check_circular=False, + allow_nan=False, + indent=None, + separators=separators, + default=convert_other, + sort_keys=False, + ) + file.write(line_delimited) else: if isinstance(array, (ak._v2.highlevel.Record, ak._v2.record.Record)): diff --git a/tests/v2/test_1449-v2-to_json-from_json-functions.py b/tests/v2/test_1449-v2-to_json-from_json-functions.py new file mode 100644 index 0000000000..dfcf07693b --- /dev/null +++ b/tests/v2/test_1449-v2-to_json-from_json-functions.py @@ -0,0 +1,237 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +import os +import base64 + +import pytest # noqa: F401 +import numpy as np # noqa: F401 +import awkward as ak # noqa: F401 + + +def test_to_json_options(tmp_path): + filename = os.path.join(tmp_path, "whatever.json") + + array = ak._v2.Array( + [ + {"x": 1.1, "y": 1 + 1j, "z": b"one"}, + {"x": 2.2, "y": 2 + 2j, "z": b"two"}, + {"x": 3.3, "y": 3 + 3j, "z": b"three"}, + {"x": float("nan"), "y": float("nan"), "z": b"four"}, + {"x": float("inf"), "y": float("inf") + 5j, "z": b"five"}, + {"x": float("-inf"), "y": 6 + float("-inf") * 1j, "z": b"six"}, + {"x": 7.7, "y": 7 + 7j, "z": b"seven"}, + {"x": None, "y": 8 + 8j, "z": b"eight"}, + {"x": 9.9, "y": 9 + 9j, "z": b"nine"}, + ] + ) + + kwargs = { 
+ "nan_string": "nan", + "infinity_string": "inf", + "minus_infinity_string": "-inf", + "complex_record_fields": ("real", "imag"), + "convert_bytes": lambda x: base64.b64encode(x).decode(), + } + + expectation = '[{"x":1.1,"y":{"real":1.0,"imag":1.0},"z":"b25l"},{"x":2.2,"y":{"real":2.0,"imag":2.0},"z":"dHdv"},{"x":3.3,"y":{"real":3.0,"imag":3.0},"z":"dGhyZWU="},{"x":"nan","y":{"real":"nan","imag":0.0},"z":"Zm91cg=="},{"x":"inf","y":{"real":"inf","imag":5.0},"z":"Zml2ZQ=="},{"x":"-inf","y":{"real":"nan","imag":"-inf"},"z":"c2l4"},{"x":7.7,"y":{"real":7.0,"imag":7.0},"z":"c2V2ZW4="},{"x":null,"y":{"real":8.0,"imag":8.0},"z":"ZWlnaHQ="},{"x":9.9,"y":{"real":9.0,"imag":9.0},"z":"bmluZQ=="}]' + + assert ak._v2.to_json(array, **kwargs) == expectation + + ak._v2.to_json(array, filename, **kwargs) + with open(filename) as file: + assert file.read() == expectation + + with open(filename, "w") as file: + ak._v2.to_json(array, file, **kwargs) + with open(filename) as file: + assert file.read() == expectation + + expectation = '{"x":1.1,"y":{"real":1.0,"imag":1.0},"z":"b25l"}' + + assert ak._v2.to_json(array[0], **kwargs) == expectation + + ak._v2.to_json(array[0], filename, **kwargs) + with open(filename) as file: + assert file.read() == expectation + + with open(filename, "w") as file: + ak._v2.to_json(array[0], file, **kwargs) + with open(filename) as file: + assert file.read() == expectation + + expectation = """[ + { + "x": 1.1, + "y": { + "real": 1.0, + "imag": 1.0 + }, + "z": "b25l" + }, + { + "x": 2.2, + "y": { + "real": 2.0, + "imag": 2.0 + }, + "z": "dHdv" + }, + { + "x": 3.3, + "y": { + "real": 3.0, + "imag": 3.0 + }, + "z": "dGhyZWU=" + }, + { + "x": "nan", + "y": { + "real": "nan", + "imag": 0.0 + }, + "z": "Zm91cg==" + }, + { + "x": "inf", + "y": { + "real": "inf", + "imag": 5.0 + }, + "z": "Zml2ZQ==" + }, + { + "x": "-inf", + "y": { + "real": "nan", + "imag": "-inf" + }, + "z": "c2l4" + }, + { + "x": 7.7, + "y": { + "real": 7.0, + "imag": 7.0 + }, + "z": 
"c2V2ZW4=" + }, + { + "x": null, + "y": { + "real": 8.0, + "imag": 8.0 + }, + "z": "ZWlnaHQ=" + }, + { + "x": 9.9, + "y": { + "real": 9.0, + "imag": 9.0 + }, + "z": "bmluZQ==" + } +]""" + + assert ( + ak._v2.to_json( + array, num_indent_spaces=4, num_readability_spaces=1, **kwargs + ).replace(" \n", "\n") + == expectation + ) + + ak._v2.to_json( + array, filename, num_indent_spaces=4, num_readability_spaces=1, **kwargs + ) + with open(filename) as file: + assert file.read().replace(" \n", "\n") == expectation + + with open(filename, "w") as file: + ak._v2.to_json( + array, file, num_indent_spaces=4, num_readability_spaces=1, **kwargs + ) + with open(filename) as file: + assert file.read().replace(" \n", "\n") == expectation + + expectation = """{ + "x": 1.1, + "y": { + "real": 1.0, + "imag": 1.0 + }, + "z": "b25l" +}""" + + assert ( + ak._v2.to_json( + array[0], num_indent_spaces=4, num_readability_spaces=1, **kwargs + ).replace(" \n", "\n") + == expectation + ) + + ak._v2.to_json( + array[0], filename, num_indent_spaces=4, num_readability_spaces=1, **kwargs + ) + with open(filename) as file: + assert file.read().replace(" \n", "\n") == expectation + + with open(filename, "w") as file: + ak._v2.to_json( + array[0], file, num_indent_spaces=4, num_readability_spaces=1, **kwargs + ) + with open(filename) as file: + assert file.read().replace(" \n", "\n") == expectation + + expectation = """{"x":1.1,"y":{"real":1.0,"imag":1.0},"z":"b25l"} +{"x":2.2,"y":{"real":2.0,"imag":2.0},"z":"dHdv"} +{"x":3.3,"y":{"real":3.0,"imag":3.0},"z":"dGhyZWU="} +{"x":"nan","y":{"real":"nan","imag":0.0},"z":"Zm91cg=="} +{"x":"inf","y":{"real":"inf","imag":5.0},"z":"Zml2ZQ=="} +{"x":"-inf","y":{"real":"nan","imag":"-inf"},"z":"c2l4"} +{"x":7.7,"y":{"real":7.0,"imag":7.0},"z":"c2V2ZW4="} +{"x":null,"y":{"real":8.0,"imag":8.0},"z":"ZWlnaHQ="} +{"x":9.9,"y":{"real":9.0,"imag":9.0},"z":"bmluZQ=="} +""" + + assert ak._v2.to_json(array, line_delimited=True, **kwargs) == expectation + + 
ak._v2.to_json(array, filename, line_delimited=True, **kwargs) + with open(filename) as file: + assert file.read() == expectation + + with open(filename, "w") as file: + ak._v2.to_json(array, file, line_delimited=True, **kwargs) + with open(filename) as file: + assert file.read() == expectation + + expectation = """{"x":1.1,"y":{"real":1.0,"imag":1.0},"z":"b25l"} +""" + + assert ak._v2.to_json(array[0], line_delimited=True, **kwargs) == expectation + + ak._v2.to_json(array[0], filename, line_delimited=True, **kwargs) + with open(filename) as file: + assert file.read() == expectation + + with open(filename, "w") as file: + ak._v2.to_json(array[0], file, line_delimited=True, **kwargs) + with open(filename) as file: + assert file.read() == expectation + + expectation = """{"x":1.1,"y":{"real":1.0,"imag":1.0},"z":"b25l"} +{"x":2.2,"y":{"real":2.0,"imag":2.0},"z":"dHdv"} +{"x":3.3,"y":{"real":3.0,"imag":3.0},"z":"dGhyZWU="} +{"x":"nan","y":{"real":"nan","imag":0.0},"z":"Zm91cg=="} +{"x":"inf","y":{"real":"inf","imag":5.0},"z":"Zml2ZQ=="} +{"x":"-inf","y":{"real":"nan","imag":"-inf"},"z":"c2l4"} +{"x":7.7,"y":{"real":7.0,"imag":7.0},"z":"c2V2ZW4="} +{"x":null,"y":{"real":8.0,"imag":8.0},"z":"ZWlnaHQ="} +{"x":9.9,"y":{"real":9.0,"imag":9.0},"z":"bmluZQ=="} +""" + + with open(filename, "w") as file: + for x in array: + ak._v2.to_json(x, file, line_delimited=True, **kwargs) + with open(filename) as file: + assert file.read() == expectation From 50ff3bd7afd695b50a288b818477b816d5034f09 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Apr 2022 19:18:07 -0500 Subject: [PATCH 09/13] Satisfy pylint. 
--- src/awkward/_v2/operations/convert/ak_to_json.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/awkward/_v2/operations/convert/ak_to_json.py b/src/awkward/_v2/operations/convert/ak_to_json.py index 5e61b08d8a..bdb599dddb 100644 --- a/src/awkward/_v2/operations/convert/ak_to_json.py +++ b/src/awkward/_v2/operations/convert/ak_to_json.py @@ -240,11 +240,11 @@ def opener(): return "".join(out) else: - with opener() as file: + with opener() as openfile: for datum in jsondata: json.dump( datum, - file, + openfile, skipkeys=True, ensure_ascii=True, check_circular=False, @@ -273,10 +273,10 @@ def opener(): sort_keys=False, ) else: - with opener() as file: + with opener() as openfile: return json.dump( jsondata, - file, + openfile, skipkeys=True, ensure_ascii=True, check_circular=False, From 50166cd0e3173fea109f5366b2ea5e80b76e160d Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 28 Apr 2022 07:04:28 -0500 Subject: [PATCH 10/13] Fix file -> openfile bug, introduced in last commit. --- src/awkward/_v2/operations/convert/ak_to_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/_v2/operations/convert/ak_to_json.py b/src/awkward/_v2/operations/convert/ak_to_json.py index bdb599dddb..0425e910dc 100644 --- a/src/awkward/_v2/operations/convert/ak_to_json.py +++ b/src/awkward/_v2/operations/convert/ak_to_json.py @@ -254,7 +254,7 @@ def opener(): default=convert_other, sort_keys=False, ) - file.write(line_delimited) + openfile.write(line_delimited) else: if isinstance(array, (ak._v2.highlevel.Record, ak._v2.record.Record)): From 97d1f62abca8c2d1a9feb0336a6db2d3c36db96c Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 28 Apr 2022 19:09:22 -0500 Subject: [PATCH 11/13] Implemented 'ak._ext.fromjsonobj' to parse JSON from a file-like object. 
--- include/awkward/io/json.h | 64 ++++++++++++----- include/awkward/python/io.h | 3 + src/libawkward/io/json.cpp | 132 ++++++++++++++++++++++++++++++++++++ src/python/_ext.cpp | 1 + src/python/io.cpp | 66 ++++++++++++++++++ 5 files changed, 250 insertions(+), 16 deletions(-) diff --git a/include/awkward/io/json.h b/include/awkward/io/json.h index 6482a7c200..fc708e9ab9 100644 --- a/include/awkward/io/json.h +++ b/include/awkward/io/json.h @@ -329,14 +329,13 @@ namespace awkward { /// ArrayBuilder. /// /// @param source Null-terminated string containing any valid JSON data. - /// @param options Configuration options for building an array with an - /// ArrayBuilder. - /// @param nan_string user-defined string for a not-a-number (NaN) value - /// representation in JSON format - /// @param infinity_string user-defined string for a positive infinity - /// representation in JSON format - /// @param minus_infinity_string user-defined string for a negative - /// infinity representation in JSON format + /// @param builder To build the array. + /// @param nan_string User-defined string for a not-a-number (NaN) value + /// representation in JSON format. + /// @param infinity_string User-defined string for a positive infinity + /// representation in JSON format. + /// @param minus_infinity_string User-defined string for a negative + /// infinity representation in JSON format. LIBAWKWARD_EXPORT_SYMBOL int64_t FromJsonString(const char* source, ArrayBuilder& builder, @@ -348,15 +347,14 @@ namespace awkward { /// ArrayBuilder. /// /// @param source C file handle to a file containing any valid JSON data. - /// @param options Configuration options for building an array with an - /// ArrayBuilder. + /// @param builder To build the array. /// @param buffersize Number of bytes for an intermediate buffer. 
- /// @param nan_string user-defined string for a not-a-number (NaN) value - /// representation in JSON format - /// @param infinity_string user-defined string for a positive infinity - /// representation in JSON format - /// @param minus_infinity_string user-defined string for a negative - /// infinity representation in JSON format + /// @param nan_string User-defined string for a not-a-number (NaN) value + /// representation in JSON format. + /// @param infinity_string User-defined string for a positive infinity + /// representation in JSON format. + /// @param minus_infinity_string User-defined string for a negative + /// infinity representation in JSON format. LIBAWKWARD_EXPORT_SYMBOL int64_t FromJsonFile(FILE* source, ArrayBuilder& builder, @@ -365,6 +363,40 @@ namespace awkward { const char* infinity_string = nullptr, const char* minus_infinity_string = nullptr); + /// @class FileLikeObject + /// + /// @brief Abstract class to represent a file-like object, something with + /// a `read(num_bytes)` method. Satisfies RapidJSON's Stream interface. + class FileLikeObject { + public: + virtual int64_t read(int64_t num_bytes, char* buffer) = 0; + }; + + /// @brief Parses a JSON-encoded file-like object using an + /// ArrayBuilder. + /// + /// @param source File-like object wrapped with the FileLikeObject + /// abstraction (borrowed reference). + /// @param builder To build the array. + /// @param buffersize Number of bytes for an intermediate buffer. + /// @param read_one If true, read only one JSON object (with an error if + /// there's more); otherwise, read a stream of concatenated objects (may + /// be separated by newlines, but we don't check). + /// @param nan_string User-defined string for a not-a-number (NaN) value + /// representation in JSON format. + /// @param infinity_string User-defined string for a positive infinity + /// representation in JSON format. 
+ /// @param minus_infinity_string User-defined string for a negative + /// infinity representation in JSON format. + LIBAWKWARD_EXPORT_SYMBOL int64_t + FromJsonObject(FileLikeObject* source, + ArrayBuilder& builder, + int64_t buffersize, + bool read_one, + const char* nan_string = nullptr, + const char* infinity_string = nullptr, + const char* minus_infinity_string = nullptr); + } #endif // AWKWARD_IO_JSON_H_ diff --git a/include/awkward/python/io.h b/include/awkward/python/io.h index 59f4ed526e..ed6fb18a5e 100644 --- a/include/awkward/python/io.h +++ b/include/awkward/python/io.h @@ -13,6 +13,9 @@ make_fromjson(py::module& m, const std::string& name); void make_fromjsonfile(py::module& m, const std::string& name); +void +make_fromjsonobj(py::module& m, const std::string& name); + void make_uproot_issue_90(py::module& m); diff --git a/src/libawkward/io/json.cpp b/src/libawkward/io/json.cpp index 4ef830c368..4e18aee6e9 100644 --- a/src/libawkward/io/json.cpp +++ b/src/libawkward/io/json.cpp @@ -838,4 +838,136 @@ namespace awkward { minus_infinity_string); return do_parse(handler, reader, stream); } + + class FileLikeObjectStream { + public: + typedef char Ch; + + FileLikeObjectStream(FileLikeObject* source, int64_t buffersize) + : source_(source) + , buffersize_(buffersize) + , bufferlast_(0) + , current_(0) + , readcount_(0) + , count_(0) + , eof_(false) { + buffer_ = new char[buffersize]; + read(); + } + + ~FileLikeObjectStream() { + delete [] buffer_; + } + + Ch Peek() const { + return *current_; + } + Ch Take() { + Ch c = *current_; + read(); + return c; + } + size_t Tell() const { + return count_ + static_cast<size_t>(current_ - buffer_); + } + + // not implemented + void Put(Ch) { assert(false); } + void Flush() { assert(false); } + Ch* PutBegin() { assert(false); return 0; } + size_t PutEnd(Ch*) { assert(false); return 0; } + + private: + void read() { + if (current_ < bufferlast_) { + ++current_; + } + else if (!eof_) { + count_ += readcount_; + readcount_ =
source_->read(buffersize_, buffer_); + bufferlast_ = buffer_ + readcount_ - 1; + current_ = buffer_; + + if (readcount_ < buffersize_) { + buffer_[readcount_] = '\0'; + ++bufferlast_; + eof_ = true; + } + } + } + + FileLikeObject* source_; + int64_t buffersize_; + Ch* buffer_; + Ch* bufferlast_; + Ch* current_; + int64_t readcount_; + int64_t count_; + bool eof_; + }; + + int64_t + FromJsonObject(FileLikeObject* source, + ArrayBuilder& builder, + int64_t buffersize, + bool read_one, + const char* nan_string, + const char* infinity_string, + const char* minus_infinity_string) { + + rj::Reader reader; + FileLikeObjectStream stream(source, buffersize); + Handler handler(builder, + nan_string, + infinity_string, + minus_infinity_string); + + if (read_one) { + bool fully_parsed = reader.Parse(stream, handler); + if (!fully_parsed) { + throw std::invalid_argument( + std::string("JSON syntax error at char ") + + std::to_string(stream.Tell()) + std::string(": \'") + + stream.Peek() + std::string("\'") + + FILENAME(__LINE__)); + } + return 1; + } + + else { + int64_t number = 0; + while (stream.Peek() != 0) { + handler.reset_moved(); + bool fully_parsed = reader.Parse(stream, handler); + if (handler.moved()) { + if (!fully_parsed) { + if (stream.Peek() == 0) { + throw std::invalid_argument( + std::string("incomplete JSON object at the end of the stream") + + FILENAME(__LINE__)); + } + else { + throw std::invalid_argument( + std::string("JSON syntax error at char ") + + std::to_string(stream.Tell()) + std::string(": \'") + + stream.Peek() + std::string("\'") + + FILENAME(__LINE__)); + } + } + else { + number++; + } + } + else if (stream.Peek() != 0) { + throw std::invalid_argument( + std::string("JSON syntax error at char ") + + std::to_string(stream.Tell()) + std::string(": \'") + + stream.Peek() + std::string("\'") + + FILENAME(__LINE__)); + } + } + return number; + } + + } } diff --git a/src/python/_ext.cpp b/src/python/_ext.cpp index 697e7251bd..3c7af04b56 100644 --- 
a/src/python/_ext.cpp +++ b/src/python/_ext.cpp @@ -135,6 +135,7 @@ PYBIND11_MODULE(_ext, m) { make_fromjson(m, "fromjson"); make_fromjsonfile(m, "fromjsonfile"); + make_fromjsonobj(m, "fromjsonobj"); make_uproot_issue_90(m); diff --git a/src/python/io.cpp b/src/python/io.cpp index 661b50a412..69fa8c523d 100644 --- a/src/python/io.cpp +++ b/src/python/io.cpp @@ -82,6 +82,72 @@ make_fromjsonfile(py::module& m, const std::string& name) { py::arg("buffersize") = 65536); } +class PythonFileLikeObject : public ak::FileLikeObject { +public: + PythonFileLikeObject(py::object& obj) : obj_(obj) { } + + int64_t read(int64_t num_bytes, char* buffer) { + // assuming that this is being called from code in which the GIL has been released + py::gil_scoped_acquire acquire; + + py::object data = obj_.attr("read")(num_bytes); + + if (!PyBytes_Check(data.ptr())) { + throw py::type_error("obj.read(num_bytes) should return bytes (is the file mode 'rb'?)"); + } + + int64_t num_bytes_read = PyBytes_Size(data.ptr()); + + if (num_bytes_read > num_bytes) { + throw py::type_error("obj.read(num_bytes) returned a larger bytes object than num_bytes"); + } + + std::strncpy(buffer, PyBytes_AsString(data.ptr()), std::min(num_bytes, num_bytes_read)); + + py::gil_scoped_release release; + + return num_bytes_read; + } + +private: + py::object obj_; +}; + +void +make_fromjsonobj(py::module& m, const std::string& name) { + m.def(name.c_str(), + [](py::object& source, + ak::ArrayBuilder& builder, + bool read_one, + int64_t buffersize, + const char* nan_string, + const char* infinity_string, + const char* minus_infinity_string) -> int64_t { + + PythonFileLikeObject obj(source); + + py::gil_scoped_release release; + + int64_t out = ak::FromJsonObject(&obj, + builder, + buffersize, + read_one, + nan_string, + infinity_string, + minus_infinity_string); + + py::gil_scoped_acquire acquire; + + return out; + }, py::arg("source"), + py::arg("builder"), + py::arg("read_one"), + py::arg("buffersize"), + 
py::arg("nan_string"), + py::arg("infinity_string"), + py::arg("minus_infinity_string")); +} + ////////// Uproot connector void From 2b5c3be3560a46274ef1e2837be3e293b7809975 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 28 Apr 2022 20:21:13 -0500 Subject: [PATCH 12/13] Implemented JSON from file-like objects for no schema. (No tests yet.) --- .../operations/convert/ak_from_json_new.py | 231 ++++++++++++++++++ .../_v2/operations/convert/ak_to_json.py | 16 +- src/libawkward/io/json.cpp | 9 +- 3 files changed, 243 insertions(+), 13 deletions(-) create mode 100644 src/awkward/_v2/operations/convert/ak_from_json_new.py diff --git a/src/awkward/_v2/operations/convert/ak_from_json_new.py b/src/awkward/_v2/operations/convert/ak_from_json_new.py new file mode 100644 index 0000000000..cc42da675f --- /dev/null +++ b/src/awkward/_v2/operations/convert/ak_from_json_new.py @@ -0,0 +1,231 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +import pathlib +from urllib.parse import urlparse + +import awkward as ak + +np = ak.nplike.NumpyMetadata.instance() + + +def from_json( + source, + line_delimited=False, + schema=None, + nan_string=None, + infinity_string=None, + minus_infinity_string=None, + complex_record_fields=None, + buffersize=65536, + initial=1024, + resize=1.5, + highlevel=True, + behavior=None, +): + """ + Args: + source (bytes/str, pathlib.Path, or file-like object): Data source of the + JSON-formatted string(s). If bytes/str, the string is parsed. If a + `pathlib.Path`, a file with that name is opened, parsed, and closed. + If that path has a URI protocol (like "https://" or "s3://"), this + function attempts to open the file with the fsspec library. If a + file-like object with a `read` method, this function reads from the + object, but does not close it. + line_delimited (bool): If False, a single JSON document is read as an + entire array or record. 
If True, this function reads line-delimited + JSON into an array (regardless of how many there are). The line + delimiter is not actually checked, so it may be `"\n"`, `"\r\n"` + or anything else. + schema (None, JSON str or equivalent lists/dicts): If None, the data type + is discovered while parsing. If a JSONSchema, that schema is used to + parse the JSON more quickly by skipping type-discovery. + nan_string (None or str): If not None, strings with this value will be + interpreted as floating-point NaN values. + infinity_string (None or str): If not None, strings with this value will + be interpreted as floating-point positive infinity values. + minus_infinity_string (None or str): If not None, strings with this value + will be interpreted as floating-point negative infinity values. + complex_record_fields (None or (str, str)): If not None, defines a pair of + field names to interpret 2-field records as complex numbers. + buffersize (int): Number of bytes in each read from source: larger + values use more memory but read less frequently. (Python GIL is released + between read events.) + initial (int): Initial size (in bytes) of buffers used by + #ak.layout.ArrayBuilder (see #ak.layout.ArrayBuilderOptions). + resize (float): Resize multiplier for buffers used by + #ak.layout.ArrayBuilder (see #ak.layout.ArrayBuilderOptions); + should be strictly greater than 1. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.layout.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Converts a JSON string into an Awkward Array. + + FIXME: needs documentation. + + See also #ak.to_json. 
+ """ + with ak._v2._util.OperationErrorContext( + "ak._v2.from_json", + dict( + source=source, + line_delimited=line_delimited, + schema=schema, + nan_string=nan_string, + infinity_string=infinity_string, + minus_infinity_string=minus_infinity_string, + complex_record_fields=complex_record_fields, + buffersize=buffersize, + initial=initial, + resize=resize, + highlevel=highlevel, + behavior=behavior, + ), + ): + if schema is None: + return _no_schema( + source, + line_delimited, + nan_string, + infinity_string, + minus_infinity_string, + complex_record_fields, + buffersize, + initial, + resize, + highlevel, + behavior, + ) + + else: + raise ak._v2._util.error(NotImplementedError) + + +class _BytesReader: + __slots__ = ("data", "current") + + def __init__(self, data): + self.data = data + self.current = 0 + + def read(self, num_bytes): + before = self.current + self.current += num_bytes + return self.data[before : self.current] + + def __enter__(self): + return self + + def __exit__(self, exception_type, exception_value, exception_traceback): + pass + + +class _NoContextManager: + def __init__(self, file): + self.file = file + + def __enter__(self): + return self.file + + def __exit__(self, exception_type, exception_value, exception_traceback): + pass + + +def _get_reader(source): + if not isinstance(source, pathlib.Path) and isinstance(source, str): + source = source.encode("utf8", errors="surrogateescape") + + if isinstance(source, bytes): + return lambda: _BytesReader(source) + + elif isinstance(source, pathlib.Path): + parsed_url = urlparse(str(source)) + if parsed_url.scheme == "" or parsed_url.netloc == "": + return lambda: open(source, "rb") + else: + import fsspec + + return lambda: fsspec.open(source, "rb").open() + + else: + return lambda: _NoContextManager(source) + + +def _record_to_complex(layout, complex_record_fields): + if complex_record_fields is None: + return layout + + elif ( + isinstance(complex_record_fields, tuple) + and 
len(complex_record_fields) == 2 + and isinstance(complex_record_fields[0], str) + and isinstance(complex_record_fields[1], str) + ): + + def action(node, **kwargs): + if isinstance(node, ak._v2.contents.RecordArray): + if set(node.fields) == set(complex_record_fields): + real = node._getitem_field(complex_record_fields[0]) + imag = node._getitem_field(complex_record_fields[1]) + if ( + isinstance(real, ak._v2.contents.NumpyArray) + and len(real.shape) == 1 + and isinstance(imag, ak._v2.contents.NumpyArray) + and len(imag.shape) == 1 + ): + return ak._v2.contents.NumpyArray( + node._nplike.asarray(real) + node._nplike.asarray(imag) * 1j + ) + + return layout.recursively_apply(action) + + else: + raise ak._v2._util.error( + TypeError("complex_record_fields must be None or a pair of strings") + ) + + +def _no_schema( + source, + line_delimited, + nan_string, + infinity_string, + minus_infinity_string, + complex_record_fields, + buffersize, + initial, + resize, + highlevel, + behavior, +): + builder = ak.layout.ArrayBuilder(initial=initial, resize=resize) + + read_one = not line_delimited + + with _get_reader(source)() as obj: + ak._ext.fromjsonobj( + obj, + builder, + read_one, + buffersize, + nan_string, + infinity_string, + minus_infinity_string, + ) + + formstr, length, buffers = builder.to_buffers() + form = ak._v2.forms.from_json(formstr) + layout = ak._v2.operations.convert.from_buffers( + form, length, buffers, highlevel=False + ) + + layout = _record_to_complex(layout, complex_record_fields) + + if read_one: + layout = layout[0] + + if highlevel: + return ak._v2._util.wrap(layout, behavior, highlevel) + else: + return layout diff --git a/src/awkward/_v2/operations/convert/ak_to_json.py b/src/awkward/_v2/operations/convert/ak_to_json.py index 0425e910dc..1fc385f6d8 100644 --- a/src/awkward/_v2/operations/convert/ak_to_json.py +++ b/src/awkward/_v2/operations/convert/ak_to_json.py @@ -1,6 +1,7 @@ # BSD 3-Clause License; see 
https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE import json +import pathlib from urllib.parse import urlparse from numbers import Number @@ -25,10 +26,10 @@ def to_json( """ Args: array: Data to convert to JSON. - file (None, str, or file-like object): If None, this function returns a - JSON-encoded string. Otherwise, this function has no return value. - If a string, this function opens a file with that name, writes JSON - data, and closes the file. If that string has a URI protocol (like + file (None, str/pathlib.Path, or file-like object): If None, this function returns + JSON-encoded bytes. Otherwise, this function has no return value. + If a string/pathlib.Path, this function opens a file with that name, writes JSON + data, and closes the file. If that path has a URI protocol (like "https://" or "s3://"), this function attempts to open the file with the fsspec library. If a file-like object with a `write` method, this function writes to the object, but does not close it. @@ -61,7 +62,8 @@ def to_json( but are not JSON serializable. Converts `array` (many types supported, including all Awkward Arrays and - Records) into a JSON string. + Records) into JSON text. Returns bytes (encoded JSON) if `file` is None; + otherwise, this function returns nothing and writes to a file. 
This function converts the array into Python objects with #ak.to_list, performs some conversions to make the data JSON serializable (`nan_string`, `infinity_string`, @@ -200,7 +202,7 @@ def _impl( ) if file is not None: - if ak._v2._util.isstr(file): + if ak._v2._util.isstr(file) or isinstance(file, pathlib.Path): parsed_url = urlparse(file) if parsed_url.scheme == "" or parsed_url.netloc == "": @@ -211,7 +213,7 @@ def opener(): import fsspec def opener(): - return fsspec.open(file, "w", encoding="utf8") + return fsspec.open(file, "w", encoding="utf8").open() else: diff --git a/src/libawkward/io/json.cpp b/src/libawkward/io/json.cpp index 4e18aee6e9..d7c20948b6 100644 --- a/src/libawkward/io/json.cpp +++ b/src/libawkward/io/json.cpp @@ -927,8 +927,7 @@ namespace awkward { if (!fully_parsed) { throw std::invalid_argument( std::string("JSON syntax error at char ") - + std::to_string(stream.Tell()) + std::string(": \'") - + stream.Peek() + std::string("\'") + + std::to_string(stream.Tell()) + FILENAME(__LINE__)); } return 1; @@ -949,8 +948,7 @@ namespace awkward { else { throw std::invalid_argument( std::string("JSON syntax error at char ") - + std::to_string(stream.Tell()) + std::string(": \'") - + stream.Peek() + std::string("\'") + + std::to_string(stream.Tell()) + FILENAME(__LINE__)); } } @@ -961,8 +959,7 @@ namespace awkward { else if (stream.Peek() != 0) { throw std::invalid_argument( std::string("JSON syntax error at char ") - + std::to_string(stream.Tell()) + std::string(": \'") - + stream.Peek() + std::string("\'") + + std::to_string(stream.Tell()) + FILENAME(__LINE__)); } } From 2f86ae55a2eb0aab83255623b4776a8ed576059d Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 29 Apr 2022 07:47:50 -0500 Subject: [PATCH 13/13] Satisfy pylint (we do use a 'with' statement, later on). 
--- src/awkward/_v2/operations/convert/ak_from_json_new.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/_v2/operations/convert/ak_from_json_new.py b/src/awkward/_v2/operations/convert/ak_from_json_new.py index cc42da675f..e605cf2ecf 100644 --- a/src/awkward/_v2/operations/convert/ak_from_json_new.py +++ b/src/awkward/_v2/operations/convert/ak_from_json_new.py @@ -142,7 +142,7 @@ def _get_reader(source): elif isinstance(source, pathlib.Path): parsed_url = urlparse(str(source)) if parsed_url.scheme == "" or parsed_url.netloc == "": - return lambda: open(source, "rb") + return lambda: open(source, "rb") # pylint: disable=R1732 else: import fsspec