From 2c9c0790063276c2ac4238c2d54f4eed86bd9805 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 10 Dec 2020 18:28:50 -0600 Subject: [PATCH 1/8] Added ak.to_buffers with a new interface; old ak.to_arrayset uses it. --- src/awkward/operations/convert.py | 376 ++++++++++++++++++++++-------- 1 file changed, 275 insertions(+), 101 deletions(-) diff --git a/src/awkward/operations/convert.py b/src/awkward/operations/convert.py index 4d0205c557..c7bc120acd 100644 --- a/src/awkward/operations/convert.py +++ b/src/awkward/operations/convert.py @@ -2694,12 +2694,12 @@ def from_parquet( return out -def _arrayset_key( +def _buffers_key( form_key, attribute, partition, prefix, sep, partition_first, ): if form_key is None: raise ValueError( - "cannot read from arrayset using Forms without form_keys" + "cannot ak.from_buffers using Forms without form_keys" + ak._util.exception_suffix(__file__) ) if attribute is None: @@ -2756,6 +2756,9 @@ def to_arrayset( at the end of the keys. This can be relevant if the `container` is sorted or lookup performance depends on alphabetical order. + **Deprecated:** this will be removed in `awkward>=1.1.0` after January 1, + 2021. Use #ak.to_buffers instead: the return values have changed. + Decomposes an Awkward Array into a Form and a collection of arrays, so that data can be losslessly written to file formats and storage devices that only understand named arrays (or binary blobs). @@ -2855,6 +2858,196 @@ def to_arrayset( See also #ak.from_arrayset. 
""" + + layout = to_layout(array, allow_record=False, allow_other=False) + + if isinstance(layout, ak.partition.PartitionedArray): + show_partition = True + if partition is not None: + raise ValueError( + "array is partitioned; an explicit 'partition' should not be " + "assigned" + ak._util.exception_suffix(__file__) + ) + else: + if partition is None: + show_partition = False + else: + show_partition = True + + if partition is None: + partition = 0 + + def key_format(**v): + v["sep"] = sep + if prefix is None: + v["prefix"] = "" + else: + v["prefix"] = prefix + sep + + if not show_partition: + if v["attribute"] == "data": + return "{prefix}node{node}".format(**v) + else: + return "{prefix}node{node}{sep}{attribute}".format(**v) + + elif partition_first: + if v["attribute"] == "data": + return "{prefix}part{partition}{sep}node{node}".format(**v) + else: + return "{prefix}part{partition}{sep}node{node}{sep}{attribute}".format(**v) + + else: + if v["attribute"] == "data": + return "{prefix}node{node}{sep}part{partition}".format(**v) + else: + return "{prefix}node{node}{sep}{attribute}{sep}part{partition}".format(**v) + + def form_key_format(**v): + return "node{node}".format(**v) + + form, length, container = to_buffers( + layout, + container=container, + partition=partition, + key_format=key_format, + form_key_format=form_key_format, + ) + + if isinstance(length, (numbers.Integral, np.integer)): + num_partitions = None + else: + num_partitions = len(length) + + return form, container, num_partitions + + +def to_buffers( + array, + container=None, + partition=0, + key_format="part{partition}-node{node}-{attribute}", + form_key_format="node{node}", +): + u""" + Args: + array: Data to decompose into named buffers. + container (None or MutableMapping): The str \u2192 NumPy arrays (or + Python buffers) that represent the decomposed Awkward Array. This + `container` is only assumed to have a `__setitem__` method that + accepts strings as keys. 
+ partition (non-negative int): If `array` is not partitioned, this is + the partition number that will be used as part of the container + key. If `array` is partitioned, this will be added to the partition + numbers. + key_format (str or callable): Python format string containing + `"{partition}"`, `"{node}"`, and/or `"{attribute}"` or a function + that takes these as keyword arguments and returns a string to use + as keys for buffers in the `container`. + form_key_format (str, callable, or None): Python format string containing + `"{node}"` or a function that takes this as a keyword argument and + returns a string to use as a `form_key` for each Form node. If None, + the Form nodes have no keys. (They are not required for reconstruction.) + + Decomposes an Awkward Array into a Form and a collection of Python buffers, + so that data can be losslessly written to file formats and storage devices + that only map names to binary blobs (such as a filesystem directory). + + This function returns a 3-tuple: + + (form, length, container) + + where the `form` is a #ak.forms.Form (which can be converted to JSON + with `tojson`), the `length` is either an integer (`len(array)`) or a list + of the lengths of each partition in `array`, and the `container` is either + the MutableMapping you passed in or a new dict containing the buffers (as + NumPy arrays). + + These are also the first three arguments of #ak.from_buffers, so a full + round-trip is + + >>> reconstituted = ak.from_buffers(*ak.to_buffers(original)) + + The `container` argument lets you specify your own MutableMapping, which + might be an interface to some storage format or device (e.g. h5py). It's + okay if the `container` drops NumPy's `dtype` and `shape` information, + leaving raw bytes, since `dtype` and `shape` can be reconstituted from + the #ak.forms.NumpyForm. + + The `partition` argument lets you fill the `container` gradually or in parallel. 
+ If the `array` is not partitioned, the `partition` argument sets its + partition number (for the container keys, through `key_format`). + If the `array` is partitioned, the `partition` argument is added to each + partition number. + + Here is a simple example: + + >>> original = ak.Array([[1, 2, 3], [], [4, 5]]) + >>> form, length, container = ak.to_buffers(original) + >>> form + { + "class": "ListOffsetArray64", + "offsets": "i64", + "content": { + "class": "NumpyArray", + "itemsize": 8, + "format": "l", + "primitive": "int64", + "form_key": "node1" + }, + "form_key": "node0" + } + >>> length + 3 + >>> container + {'part0-node0-offsets': array([0, 3, 3, 5], dtype=int64), + 'part0-node1-data': array([1, 2, 3, 4, 5])} + + which may be read back with + + >>> ak.from_buffers(form, length, container) + + + Here is an example that builds up a partitioned array: + + >>> container = {} + >>> lengths = [] + >>> form, length, _ = ak.to_buffers(ak.Array([[1, 2, 3], [], [4, 5]]), container, 0) + >>> form, length, _ = ak.to_buffers(ak.Array([[6, 7, 8, 9]]), container, 1) + >>> form, length, _ = ak.to_buffers(ak.Array([[], [], []]), container, 2) + >>> form, length, _ = ak.to_buffers(ak.Array([[10]]), container, 3) + >>> form + { + "class": "ListOffsetArray64", + "offsets": "i64", + "content": { + "class": "NumpyArray", + "itemsize": 8, + "format": "l", + "primitive": "int64", + "form_key": "node1" + }, + "form_key": "node0" + } + >>> container + {'part0-node0-offsets': array([0, 3, 3, 5], dtype=int64), + 'part0-node1-data': array([1, 2, 3, 4, 5]), + 'part1-node0-offsets': array([0, 4], dtype=int64), + 'part1-node1-data': array([6, 7, 8, 9]), + 'part2-node0-offsets': array([0, 0, 0, 0], dtype=int64), + 'part2-node1-data': array([], dtype=float64), + 'part3-node0-offsets': array([0, 1], dtype=int64), + 'part3-node1-data': array([10])} + + The object returned by #ak.from_buffers is now a partitioned array: + + >>> reconstituted = ak.from_buffers(form, lengths, container) + >>> 
reconstituted + + >>> ak.partitions(reconstituted) + [3, 1, 3, 1] + + See also #ak.from_buffers. + """ if container is None: container = {} @@ -2876,33 +3069,13 @@ def index_form(index): + ak._util.exception_suffix(__file__) ) - if prefix is None: - prefix = "" - else: - prefix = prefix + sep - - if isinstance(node_format, str) or ( - ak._util.py27 and isinstance(node_format, ak._util.unicode) - ): - tmp1 = node_format - - def node_format(x): - return tmp1.format(x) + if isinstance(key_format, str): + key_format = lambda **v: key_format.format(**v) - if isinstance(partition_format, str) or ( - ak._util.py27 and isinstance(partition_format, ak._util.unicode) - ): - tmp2 = partition_format - - def partition_format(x): - return tmp2.format(x) - - def key(key_index, attribute, partition): - if partition is not None: - partition = partition_format(partition) - return _arrayset_key( - node_format(key_index), attribute, partition, prefix, sep, partition_first, - ) + if form_key_format is None: + form_key_format = lambda **v: None + elif isinstance(form_key_format, str): + form_key_format = lambda **v: form_key_format.format(**v) num_form_keys = [0] @@ -2917,15 +3090,17 @@ def fill(layout, part): if has_identities: raise NotImplementedError( - "ak.to_arrayset for an array with Identities" + "ak.to_buffers for an array with Identities" + ak._util.exception_suffix(__file__) ) if isinstance(layout, ak.layout.EmptyArray): array = numpy.asarray(layout) - container[key(key_index, None, part)] = little_endian(array) + key = key_format(node=str(key_index), attribute="data", partition=str(part)) + container[key] = little_endian(array) + return ak.forms.EmptyForm( - has_identities, parameters, node_format(key_index) + has_identities, parameters, form_key_format(node=str(key_index)) ) elif isinstance( @@ -2936,48 +3111,48 @@ def fill(layout, part): ak.layout.IndexedArray64, ), ): - container[key(key_index, "index", part)] = little_endian( - numpy.asarray(layout.index) - ) + key = 
key_format(node=str(key_index), attribute="index", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout.index)) + return ak.forms.IndexedForm( index_form(layout.index), fill(layout.content, part), has_identities, parameters, - node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance( layout, (ak.layout.IndexedOptionArray32, ak.layout.IndexedOptionArray64) ): - container[key(key_index, "index", part)] = little_endian( - numpy.asarray(layout.index) - ) + key = key_format(node=str(key_index), attribute="index", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout.index)) + return ak.forms.IndexedOptionForm( index_form(layout.index), fill(layout.content, part), has_identities, parameters, - node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance(layout, ak.layout.ByteMaskedArray): - container[key(key_index, "mask", part)] = little_endian( - numpy.asarray(layout.mask) - ) + key = key_format(node=str(key_index), attribute="mask", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout.mask)) + return ak.forms.ByteMaskedForm( index_form(layout.mask), fill(layout.content, part), layout.valid_when, has_identities, parameters, - node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance(layout, ak.layout.BitMaskedArray): - container[key(key_index, "mask", part)] = little_endian( - numpy.asarray(layout.mask) - ) + key = key_format(node=str(key_index), attribute="mask", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout.mask)) + return ak.forms.BitMaskedForm( index_form(layout.mask), fill(layout.content, part), @@ -2985,7 +3160,7 @@ def fill(layout, part): layout.lsb_order, has_identities, parameters, - node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance(layout, ak.layout.UnmaskedArray): @@ -2993,26 +3168,26 @@ def fill(layout, part): fill(layout.content, part), has_identities, parameters, 
- node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance( layout, (ak.layout.ListArray32, ak.layout.ListArrayU32, ak.layout.ListArray64), ): - container[key(key_index, "starts", part)] = little_endian( - numpy.asarray(layout.starts) - ) - container[key(key_index, "stops", part)] = little_endian( - numpy.asarray(layout.stops) - ) + key = key_format(node=str(key_index), attribute="starts", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout.starts)) + + key = key_format(node=str(key_index), attribute="stops", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout.stops)) + return ak.forms.ListForm( index_form(layout.starts), index_form(layout.stops), fill(layout.content, part), has_identities, parameters, - node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance( @@ -3023,20 +3198,22 @@ def fill(layout, part): ak.layout.ListOffsetArray64, ), ): - container[key(key_index, "offsets", part)] = little_endian( - numpy.asarray(layout.offsets) - ) + key = key_format(node=str(key_index), attribute="offsets", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout.offsets)) + return ak.forms.ListOffsetForm( index_form(layout.offsets), fill(layout.content, part), has_identities, parameters, - node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance(layout, ak.layout.NumpyArray): array = numpy.asarray(layout) - container[key(key_index, None, part)] = little_endian(array) + key = key_format(node=str(key_index), attribute="data", partition=str(part)) + container[key] = little_endian(array) + form = ak.forms.Form.from_numpy(array.dtype) return ak.forms.NumpyForm( layout.shape[1:], @@ -3044,7 +3221,7 @@ def fill(layout, part): form.format, has_identities, parameters, - node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance(layout, ak.layout.RecordArray): @@ -3057,8 +3234,9 @@ def fill(layout, part): for k in 
layout.keys(): forms.append(fill(layout[k], part)) keys.append(k) + return ak.forms.RecordForm( - forms, keys, has_identities, parameters, node_format(key_index), + forms, keys, has_identities, parameters, form_key_format(node=str(key_index)), ) elif isinstance(layout, ak.layout.RegularArray): @@ -3067,7 +3245,7 @@ def fill(layout, part): layout.size, has_identities, parameters, - node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance( @@ -3081,19 +3259,20 @@ def fill(layout, part): forms = [] for x in layout.contents: forms.append(fill(x, part)) - container[key(key_index, "tags", part)] = little_endian( - numpy.asarray(layout.tags) - ) - container[key(key_index, "index", part)] = little_endian( - numpy.asarray(layout.index) - ) + + key = key_format(node=str(key_index), attribute="tags", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout.tags)) + + key = key_format(node=str(key_index), attribute="index", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout.index)) + return ak.forms.UnionForm( index_form(layout.tags), index_form(layout.index), forms, has_identities, parameters, - node_format(key_index), + form_key_format(node=str(key_index)), ) elif isinstance(layout, ak.layout.VirtualArray): @@ -3109,16 +3288,12 @@ def fill(layout, part): layout = to_layout(array, allow_record=False, allow_other=False) if isinstance(layout, ak.partition.PartitionedArray): - if partition is not None: - raise ValueError( - "array is partitioned; an explicit 'partition' should not be " - "assigned" + ak._util.exception_suffix(__file__) - ) form = None + length = [] for part, content in enumerate(layout.partitions): num_form_keys[0] = 0 - f = fill(content, part) + f = fill(content, partition + part) if form is None: form = f @@ -3131,18 +3306,17 @@ def fill(layout, part): differs from the first Form: {2}""".format( - part, f.tojson(True, False), form.tojson(True, False) + partition + part, f.tojson(True, False), 
form.tojson(True, False) ) + ak._util.exception_suffix(__file__) ) - - num_partitions = len(layout.partitions) + length.append(len(content)) else: form = fill(layout, partition) - num_partitions = None + length = len(layout) - return form, container, num_partitions + return form, length, container _index_form_to_dtype = _index_form_to_index = _form_to_layout_class = None @@ -3197,7 +3371,7 @@ def _form_to_layout( if form.has_identities: raise NotImplementedError( - "ak.from_arrayset for an array with Identities" + "ak.from_buffers for an array with Identities" + ak._util.exception_suffix(__file__) ) else: @@ -3208,7 +3382,7 @@ def _form_to_layout( if isinstance(form, ak.forms.BitMaskedForm): raw_mask = ( container[ - _arrayset_key( + _buffers_key( form.form_key, "mask", partition, prefix, sep, partition_first, ) ] @@ -3244,7 +3418,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.ByteMaskedForm): raw_mask = ( container[ - _arrayset_key( + _buffers_key( form.form_key, "mask", partition, prefix, sep, partition_first, ) ] @@ -3277,7 +3451,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.IndexedForm): raw_index = ( container[ - _arrayset_key( + _buffers_key( form.form_key, "index", partition, prefix, sep, partition_first, ) ] @@ -3307,7 +3481,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.IndexedOptionForm): raw_index = ( container[ - _arrayset_key( + _buffers_key( form.form_key, "index", partition, prefix, sep, partition_first, ) ] @@ -3337,7 +3511,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.ListForm): raw_starts = ( container[ - _arrayset_key( + _buffers_key( form.form_key, "starts", partition, prefix, sep, partition_first, ) ] @@ -3349,7 +3523,7 @@ def _form_to_layout( ) raw_stops = ( container[ - _arrayset_key( + _buffers_key( form.form_key, "stops", partition, prefix, sep, partition_first, ) ] @@ -3379,7 +3553,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.ListOffsetForm): raw_offsets = ( container[ - _arrayset_key( 
+ _buffers_key( form.form_key, "offsets", partition, prefix, sep, partition_first, ) ] @@ -3409,7 +3583,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.NumpyForm): raw_array = ( container[ - _arrayset_key( + _buffers_key( form.form_key, None, partition, prefix, sep, partition_first, ) ] @@ -3476,7 +3650,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.UnionForm): raw_tags = ( container[ - _arrayset_key( + _buffers_key( form.form_key, "tags", partition, prefix, sep, partition_first, ) ] @@ -3488,7 +3662,7 @@ def _form_to_layout( ) raw_index = ( container[ - _arrayset_key( + _buffers_key( form.form_key, "index", partition, prefix, sep, partition_first, ) ] @@ -3550,7 +3724,7 @@ def _form_to_layout( generator = ak.layout.ArrayGenerator( _form_to_layout, args, form=form.form, length=length, ) - node_cache_key = _arrayset_key( + node_cache_key = _buffers_key( form.form.form_key, "virtual", partition, prefix, sep, partition_first, ) return ak.layout.VirtualArray( @@ -3565,15 +3739,15 @@ def _form_to_layout( ) -_from_arrayset_key_number = 0 -_from_arrayset_key_lock = threading.Lock() +_from_buffers_key_number = 0 +_from_buffers_key_lock = threading.Lock() -def _from_arrayset_key(): - global _from_arrayset_key_number - with _from_arrayset_key_lock: - out = _from_arrayset_key_number - _from_arrayset_key_number += 1 +def _from_buffers_key(): + global _from_buffers_key_number + with _from_buffers_key_lock: + out = _from_buffers_key_number + _from_buffers_key_number += 1 return out @@ -3733,7 +3907,7 @@ def partition_format(x): lazy_cache = ak.layout.ArrayCache(hold_cache) if lazy_cache_key is None: - lazy_cache_key = "ak.from_arrayset:{0}".format(_from_arrayset_key()) + lazy_cache_key = "ak.from_arrayset:{0}".format(_from_buffers_key()) if num_partitions is None: args = (form, container, None, prefix, sep, partition_first) From 7b0dd95ab3f153fecec1ee9c6fc89ad864f263b2 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 10 Dec 2020 18:33:03 -0600 Subject: 
[PATCH 2/8] Black and Flake8. --- src/awkward/_util.py | 4 +- src/awkward/operations/convert.py | 72 ++++++++++++++----- tests/test_0107-assign-fields-to-records.py | 12 +++- tests/test_0224-arrow-to-awkward.py | 16 +++-- .../test_0590-allow-regulararray-size-zero.py | 8 ++- 5 files changed, 83 insertions(+), 29 deletions(-) diff --git a/src/awkward/_util.py b/src/awkward/_util.py index 9f883e0bb7..bf1ec4d56d 100644 --- a/src/awkward/_util.py +++ b/src/awkward/_util.py @@ -761,7 +761,9 @@ def apply(inputs, depth, user): outcontent = apply(nextinputs, depth + 1, user) assert isinstance(outcontent, tuple) - return tuple(ak.layout.RegularArray(x, maxsize, maxlen) for x in outcontent) + return tuple( + ak.layout.RegularArray(x, maxsize, maxlen) for x in outcontent + ) elif not all_same_offsets(nplike, inputs): fcns = [ diff --git a/src/awkward/operations/convert.py b/src/awkward/operations/convert.py index c7bc120acd..4e0f91d864 100644 --- a/src/awkward/operations/convert.py +++ b/src/awkward/operations/convert.py @@ -1122,7 +1122,9 @@ def recurse(array, level): starts, stops, recurse(array.content, level + 1) ) for i in range(len(array.starts.shape) - 1, 0, -1): - out = ak.layout.RegularArray(out, array.starts.shape[i], array.starts.shape[i - 1]) + out = ak.layout.RegularArray( + out, array.starts.shape[i], array.starts.shape[i - 1] + ) return out elif isinstance(array, awkward0.Table): @@ -1176,7 +1178,9 @@ def recurse(array, level): ) for i in range(len(array.tags.shape) - 1, 0, -1): - out = ak.layout.RegularArray(out, array.tags.shape[i], array.tags.shape[i - 1]) + out = ak.layout.RegularArray( + out, array.tags.shape[i], array.tags.shape[i - 1] + ) return out elif isinstance(array, awkward0.MaskedArray): @@ -1188,7 +1192,9 @@ def recurse(array, level): valid_when=(not array.maskedwhen), ) for i in range(len(array.mask.shape) - 1, 0, -1): - out = ak.layout.RegularArray(out, array.mask.shape[i], array.mask.shape[i - 1]) + out = ak.layout.RegularArray( + out, 
array.mask.shape[i], array.mask.shape[i - 1] + ) return out elif isinstance(array, awkward0.BitMaskedArray): @@ -1221,7 +1227,9 @@ def recurse(array, level): index, recurse(array.content, level + 1) ) for i in range(len(array.index.shape) - 1, 0, -1): - out = ak.layout.RegularArray(out, array.index.shape[i], array.index.shape[i - 1]) + out = ak.layout.RegularArray( + out, array.index.shape[i], array.index.shape[i - 1] + ) return out elif isinstance(array, awkward0.IndexedArray): @@ -1239,7 +1247,9 @@ def recurse(array, level): index = ak.layout.Index32(array.index.reshape(-1)) out = ak.layout.IndexedArray32(index, recurse(array.content, level + 1)) for i in range(len(array.index.shape) - 1, 0, -1): - out = ak.layout.RegularArray(out, array.index.shape[i], array.index.shape[i - 1]) + out = ak.layout.RegularArray( + out, array.index.shape[i], array.index.shape[i - 1] + ) return out elif isinstance(array, awkward0.SparseArray): @@ -2894,13 +2904,17 @@ def key_format(**v): if v["attribute"] == "data": return "{prefix}part{partition}{sep}node{node}".format(**v) else: - return "{prefix}part{partition}{sep}node{node}{sep}{attribute}".format(**v) + return "{prefix}part{partition}{sep}node{node}{sep}{attribute}".format( + **v + ) else: if v["attribute"] == "data": return "{prefix}node{node}{sep}part{partition}".format(**v) else: - return "{prefix}node{node}{sep}{attribute}{sep}part{partition}".format(**v) + return "{prefix}node{node}{sep}{attribute}{sep}part{partition}".format( + **v + ) def form_key_format(**v): return "node{node}".format(**v) @@ -3070,12 +3084,12 @@ def index_form(index): ) if isinstance(key_format, str): - key_format = lambda **v: key_format.format(**v) + key_format = lambda **v: key_format.format(**v) # noqa: E731 if form_key_format is None: - form_key_format = lambda **v: None + form_key_format = lambda **v: None # noqa: E731 elif isinstance(form_key_format, str): - form_key_format = lambda **v: form_key_format.format(**v) + form_key_format = lambda 
**v: form_key_format.format(**v) # noqa: E731 num_form_keys = [0] @@ -3111,7 +3125,9 @@ def fill(layout, part): ak.layout.IndexedArray64, ), ): - key = key_format(node=str(key_index), attribute="index", partition=str(part)) + key = key_format( + node=str(key_index), attribute="index", partition=str(part) + ) container[key] = little_endian(numpy.asarray(layout.index)) return ak.forms.IndexedForm( @@ -3125,7 +3141,9 @@ def fill(layout, part): elif isinstance( layout, (ak.layout.IndexedOptionArray32, ak.layout.IndexedOptionArray64) ): - key = key_format(node=str(key_index), attribute="index", partition=str(part)) + key = key_format( + node=str(key_index), attribute="index", partition=str(part) + ) container[key] = little_endian(numpy.asarray(layout.index)) return ak.forms.IndexedOptionForm( @@ -3175,10 +3193,14 @@ def fill(layout, part): layout, (ak.layout.ListArray32, ak.layout.ListArrayU32, ak.layout.ListArray64), ): - key = key_format(node=str(key_index), attribute="starts", partition=str(part)) + key = key_format( + node=str(key_index), attribute="starts", partition=str(part) + ) container[key] = little_endian(numpy.asarray(layout.starts)) - key = key_format(node=str(key_index), attribute="stops", partition=str(part)) + key = key_format( + node=str(key_index), attribute="stops", partition=str(part) + ) container[key] = little_endian(numpy.asarray(layout.stops)) return ak.forms.ListForm( @@ -3198,7 +3220,9 @@ def fill(layout, part): ak.layout.ListOffsetArray64, ), ): - key = key_format(node=str(key_index), attribute="offsets", partition=str(part)) + key = key_format( + node=str(key_index), attribute="offsets", partition=str(part) + ) container[key] = little_endian(numpy.asarray(layout.offsets)) return ak.forms.ListOffsetForm( @@ -3236,7 +3260,11 @@ def fill(layout, part): keys.append(k) return ak.forms.RecordForm( - forms, keys, has_identities, parameters, form_key_format(node=str(key_index)), + forms, + keys, + has_identities, + parameters, + 
form_key_format(node=str(key_index)), ) elif isinstance(layout, ak.layout.RegularArray): @@ -3263,7 +3291,9 @@ def fill(layout, part): key = key_format(node=str(key_index), attribute="tags", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.tags)) - key = key_format(node=str(key_index), attribute="index", partition=str(part)) + key = key_format( + node=str(key_index), attribute="index", partition=str(part) + ) container[key] = little_endian(numpy.asarray(layout.index)) return ak.forms.UnionForm( @@ -3306,7 +3336,9 @@ def fill(layout, part): differs from the first Form: {2}""".format( - partition + part, f.tojson(True, False), form.tojson(True, False) + partition + part, + f.tojson(True, False), + form.tojson(True, False), ) + ak._util.exception_suffix(__file__) ) @@ -3645,7 +3677,9 @@ def _form_to_layout( length * form.size, ) - return ak.layout.RegularArray(content, form.size, length, identities, parameters) + return ak.layout.RegularArray( + content, form.size, length, identities, parameters + ) elif isinstance(form, ak.forms.UnionForm): raw_tags = ( diff --git a/tests/test_0107-assign-fields-to-records.py b/tests/test_0107-assign-fields-to-records.py index cc565a1f08..9b9af413c3 100644 --- a/tests/test_0107-assign-fields-to-records.py +++ b/tests/test_0107-assign-fields-to-records.py @@ -102,10 +102,14 @@ def test_regulararray(): np.array([0.0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9]) ) recordarray = ak.layout.RecordArray({"x": content}) - regulararray = ak.Array(ak.layout.RegularArray(recordarray, 3, zeros_length=0), check_valid=True) + regulararray = ak.Array( + ak.layout.RegularArray(recordarray, 3, zeros_length=0), check_valid=True + ) content2 = ak.layout.NumpyArray(np.array([100, 200, 300])) - regulararray2 = ak.Array(ak.layout.RegularArray(content2, 1, zeros_length=0), check_valid=True) + regulararray2 = ak.Array( + ak.layout.RegularArray(content2, 1, zeros_length=0), check_valid=True + ) assert 
ak.to_list(ak.with_field(regulararray, regulararray2, "y")) == [ [{"x": 0.0, "y": 100}, {"x": 1.1, "y": 100}, {"x": 2.2, "y": 100}], [{"x": 3.3, "y": 200}, {"x": 4.4, "y": 200}, {"x": 5.5, "y": 200}], @@ -115,7 +119,9 @@ def test_regulararray(): content2 = ak.layout.NumpyArray( np.array([100, 200, 300, 400, 500, 600, 700, 800, 900]) ) - regulararray2 = ak.Array(ak.layout.RegularArray(content2, 3, zeros_length=0), check_valid=True) + regulararray2 = ak.Array( + ak.layout.RegularArray(content2, 3, zeros_length=0), check_valid=True + ) assert ak.to_list(ak.with_field(regulararray, regulararray2, "y")) == [ [{"x": 0.0, "y": 100}, {"x": 1.1, "y": 200}, {"x": 2.2, "y": 300}], [{"x": 3.3, "y": 400}, {"x": 4.4, "y": 500}, {"x": 5.5, "y": 600}], diff --git a/tests/test_0224-arrow-to-awkward.py b/tests/test_0224-arrow-to-awkward.py index 693cdac495..fb36789d5a 100644 --- a/tests/test_0224-arrow-to-awkward.py +++ b/tests/test_0224-arrow-to-awkward.py @@ -1586,7 +1586,9 @@ def test_arrow_coverage100(): a = ak.layout.IndexedOptionArray32( ak.layout.Index32(np.array([-1, 1, -1, 0, 0, -1], "i4")), ak.layout.RegularArray( - ak.layout.NumpyArray(np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])), 3, zeros_length=0 + ak.layout.NumpyArray(np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])), + 3, + zeros_length=0, ), ) assert ak.to_arrow(a).to_pylist() == [ @@ -1601,7 +1603,9 @@ def test_arrow_coverage100(): a = ak.layout.IndexedOptionArray32( ak.layout.Index32(np.array([-1, 1, -1, 0, 0, -1, 1, -1], "i4")), ak.layout.RegularArray( - ak.layout.NumpyArray(np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])), 3, zeros_length=0 + ak.layout.NumpyArray(np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])), + 3, + zeros_length=0, ), ) assert ak.to_arrow(a).to_pylist() == [ @@ -1618,7 +1622,9 @@ def test_arrow_coverage100(): a = ak.layout.IndexedOptionArray64( ak.layout.Index64(np.array([-1, 1, -1, 0, 0, -1, 1, -1], "i8")), ak.layout.RegularArray( - ak.layout.NumpyArray(np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])), 3, zeros_length=0 + 
ak.layout.NumpyArray(np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])), + 3, + zeros_length=0, ), ) assert ak.to_arrow(a).to_pylist() == [ @@ -1637,7 +1643,9 @@ def test_arrow_coverage100(): ak.layout.IndexedOptionArray32( ak.layout.Index32(np.array([-1, 1, -1, 0, 0, -1], "i4")), ak.layout.RegularArray( - ak.layout.NumpyArray(np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])), 3, zeros_length=0 + ak.layout.NumpyArray(np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])), + 3, + zeros_length=0, ), ), valid_when=True, diff --git a/tests/test_0590-allow-regulararray-size-zero.py b/tests/test_0590-allow-regulararray-size-zero.py index 6d55283141..bdb9dd6846 100644 --- a/tests/test_0590-allow-regulararray-size-zero.py +++ b/tests/test_0590-allow-regulararray-size-zero.py @@ -7,7 +7,9 @@ import awkward as ak # noqa: F401 -empty = ak.Array(ak.layout.RegularArray(ak.Array([[1, 2, 3], [], [4, 5]]).layout, 0, zeros_length=0)) +empty = ak.Array( + ak.layout.RegularArray(ak.Array([[1, 2, 3], [], [4, 5]]).layout, 0, zeros_length=0) +) def test_ListOffsetArray_rpad_and_clip(): @@ -24,7 +26,9 @@ def test_toListOffsetArray64(): def test_setidentities(): empty2 = ak.Array( - ak.layout.RegularArray(ak.Array([[1, 2, 3], [], [4, 5]]).layout, 0, zeros_length=0) + ak.layout.RegularArray( + ak.Array([[1, 2, 3], [], [4, 5]]).layout, 0, zeros_length=0 + ) ) empty2.layout.setidentities() assert np.asarray(empty2.layout.identities).tolist() == [] From ea616858e692c6f87a30068a4c793be7f14ec521 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 10 Dec 2020 21:18:03 -0600 Subject: [PATCH 3/8] Works again, without modifying to/from_arrayset. 
--- src/awkward/operations/convert.py | 1062 +++++++++++++++-------------- tests/test_0348-form-keys.py | 1 - tests/test_0384-lazy-arrayset.py | 10 +- 3 files changed, 550 insertions(+), 523 deletions(-) diff --git a/src/awkward/operations/convert.py b/src/awkward/operations/convert.py index 4e0f91d864..821d694fea 100644 --- a/src/awkward/operations/convert.py +++ b/src/awkward/operations/convert.py @@ -2704,243 +2704,12 @@ def from_parquet( return out -def _buffers_key( - form_key, attribute, partition, prefix, sep, partition_first, -): - if form_key is None: - raise ValueError( - "cannot ak.from_buffers using Forms without form_keys" - + ak._util.exception_suffix(__file__) - ) - if attribute is None: - attribute = "" - else: - attribute = sep + attribute - if partition is None: - return "{0}{1}{2}".format(prefix, form_key, attribute,) - elif partition_first: - return "{0}{1}{2}{3}{4}".format(prefix, partition, sep, form_key, attribute,) - else: - return "{0}{1}{2}{3}{4}".format(prefix, form_key, attribute, sep, partition,) - - -def to_arrayset( - array, - container=None, - partition=None, - prefix=None, - node_format="node{0}", - partition_format="part{0}", - sep="-", - partition_first=False, -): - u""" - Args: - array: Data to decompose into an arrayset. - container (None or MutableMapping): The str \u2192 NumPy arrays (or - Python buffers) that represent the decomposed Awkward Array. This - `container` is only assumed to have a `__setitem__` method that - accepts strings as keys. - partition (None or non-negative int): If None and `array` is not - partitioned, keys written to the container have no reference to - partitioning; if an integer and `array` is not partitioned, keys - use this as their partition number; if `array` is partitioned, the - `partition` argument must be None and keys are written with the - array's own internal partition numbers. 
- prefix (None or str): If None, keys only contain node and partition - information; if a string, keys are all prepended by `prefix + sep`. - node_format (str or callable): Python format string or function - (returning str) of the node part of keys written to the container - and the `form_key` values in the output Form. Its only argument - (`{0}` in the format string) is the node number, unique within the - `array`. - partition_format (str or callable): Python format string or function - (returning str) of the partition part of keys written to the - container (if any). Its only argument (`{0}` in the format string) - is the partition number. - sep (str): Separates the prefix, node part, array attribute (e.g. - `"starts"`, `"stops"`, `"mask"`), and partition part of the - keys written to the container. - partition_first (bool): If True, the partition part appears immediately - after the prefix (if any); if False, the partition part appears - at the end of the keys. This can be relevant if the `container` - is sorted or lookup performance depends on alphabetical order. - - **Deprecated:** this will be removed in `awkward>=1.1.0` after January 1, - 2021. Use #ak.to_buffers instead: the return values have changed. - - Decomposes an Awkward Array into a Form and a collection of arrays, so - that data can be losslessly written to file formats and storage devices - that only understand named arrays (or binary blobs). - - This function returns a 3-tuple: - - (form, container, num_partitions) - - where the `form` is a #ak.forms.Form (which can be converted to JSON - with `tojson`), the `container` is either the MutableMapping you passed in - or a new dict containing the NumPy arrays, and `num_partitions` is None - if `array` was not partitioned or the number of partitions if it was. 
- - These are also the first three arguments of #ak.from_arrayset, so a full - round-trip is - - >>> reconstituted = ak.from_arrayset(*ak.to_arrayset(original)) - - The `container` argument lets you specify your own MutableMapping, which - might be an interface to some storage format or device (e.g. h5py). It's - okay if the `container` drops NumPy's `dtype` and `shape` information, - leaving raw bytes, since `dtype` and `shape` can be reconstituted from - the #ak.forms.NumpyForm. - - The `partition` argument lets you fill the `container` one partition at a - time using unpartitioned arrays. - - The rest of the arguments determine the format of the keys written to the - `container` (which might be restrictive if it represents a storage device). - - Here is a simple example: - - >>> original = ak.Array([[1, 2, 3], [], [4, 5]]) - >>> form, container, num_partitions = ak.to_arrayset(original) - >>> form - { - "class": "ListOffsetArray64", - "offsets": "i64", - "content": { - "class": "NumpyArray", - "itemsize": 8, - "format": "l", - "primitive": "int64", - "form_key": "node1" - }, - "form_key": "node0" - } - >>> container - {'node0-offsets': array([0, 3, 3, 5], dtype=int64), - 'node1': array([1, 2, 3, 4, 5])} - >>> print(num_partitions) - None - - which may be read back with - - >>> ak.from_arrayset(form, container) - - - (the third argument of #ak.from_arrayset defaults to None). 
- - Here is an example of building up a partitioned array: - - >>> container = {} - >>> form, _, _ = ak.to_arrayset(ak.Array([[1, 2, 3], [], [4, 5]]), container, 0) - >>> form, _, _ = ak.to_arrayset(ak.Array([[6, 7, 8, 9]]), container, 1) - >>> form, _, _ = ak.to_arrayset(ak.Array([[], [], []]), container, 2) - >>> form, _, _ = ak.to_arrayset(ak.Array([[10]]), container, 3) - >>> form - { - "class": "ListOffsetArray64", - "offsets": "i64", - "content": { - "class": "NumpyArray", - "itemsize": 8, - "format": "l", - "primitive": "int64", - "form_key": "node1" - }, - "form_key": "node0" - } - >>> container - {'node0-offsets-part0': array([0, 3, 3, 5], dtype=int64), - 'node1-part0': array([1, 2, 3, 4, 5]), - 'node0-offsets-part1': array([0, 4], dtype=int64), - 'node1-part1': array([6, 7, 8, 9]), - 'node0-offsets-part2': array([0, 0, 0, 0], dtype=int64), - 'node1-part2': array([], dtype=float64), - 'node0-offsets-part3': array([0, 1], dtype=int64), - 'node1-part3': array([10])} - - The object returned by #ak.from_arrayset is now a partitioned array: - - >>> ak.from_arrayset(form, container, 4) - - >>> ak.partitions(ak.from_arrayset(form, container, 4)) - [3, 1, 3, 1] - - See also #ak.from_arrayset. 
- """ - - layout = to_layout(array, allow_record=False, allow_other=False) - - if isinstance(layout, ak.partition.PartitionedArray): - show_partition = True - if partition is not None: - raise ValueError( - "array is partitioned; an explicit 'partition' should not be " - "assigned" + ak._util.exception_suffix(__file__) - ) - else: - if partition is None: - show_partition = False - else: - show_partition = True - - if partition is None: - partition = 0 - - def key_format(**v): - v["sep"] = sep - if prefix is None: - v["prefix"] = "" - else: - v["prefix"] = prefix + sep - - if not show_partition: - if v["attribute"] == "data": - return "{prefix}node{node}".format(**v) - else: - return "{prefix}node{node}{sep}{attribute}".format(**v) - - elif partition_first: - if v["attribute"] == "data": - return "{prefix}part{partition}{sep}node{node}".format(**v) - else: - return "{prefix}part{partition}{sep}node{node}{sep}{attribute}".format( - **v - ) - - else: - if v["attribute"] == "data": - return "{prefix}node{node}{sep}part{partition}".format(**v) - else: - return "{prefix}node{node}{sep}{attribute}{sep}part{partition}".format( - **v - ) - - def form_key_format(**v): - return "node{node}".format(**v) - - form, length, container = to_buffers( - layout, - container=container, - partition=partition, - key_format=key_format, - form_key_format=form_key_format, - ) - - if isinstance(length, (numbers.Integral, np.integer)): - num_partitions = None - else: - num_partitions = len(length) - - return form, container, num_partitions - - def to_buffers( array, container=None, - partition=0, - key_format="part{partition}-node{node}-{attribute}", - form_key_format="node{node}", + partition_start=0, + form_key="node{id}", + key_format="part{partition}-{form_key}-{attribute}", ): u""" Args: @@ -2949,18 +2718,21 @@ def to_buffers( Python buffers) that represent the decomposed Awkward Array. This `container` is only assumed to have a `__setitem__` method that accepts strings as keys. 
- partition (non-negative int): If `array` is not partitioned, this is + partition_start (non-negative int): If `array` is not partitioned, this is the partition number that will be used as part of the container - key. If `array` is partitioned, this will be added to the partition - numbers. + key. If `array` is partitioned, this is the first partition number. + form_key (str, callable): Python format string containing + `"{id}"` or a function that takes non-negative integer as a string + and the current `layout` as keyword arguments and returns a string, + for use as a `form_key` on each Form node and in `key_format` (below). key_format (str or callable): Python format string containing - `"{partition}"`, `"{node}"`, and/or `"{attribute}"` or a function + `"{partition}"`, `"{form_key}"`, and/or `"{attribute}"` or a function that takes these as keyword arguments and returns a string to use - as keys for buffers in the `container`. - form_key_format (str, callable, or None): Python format string containing - `"{node}"` or a function that takes this as a keyword argument and - returns a string to use as a `form_key` for each Form node. If None, - the Form nodes have no keys. (They are not required for reconstruction.) + as keys for buffers in the `container`. The `partition` is a + partition number (non-negative integer, passed as a string), the + `form_key` is the result of applying `form_key` (above), and the + `attribute` is a hard-coded string representing the buffer's function + (e.g. `"data"`, `"offsets"`, `"index"`). Decomposes an Awkward Array into a Form and a collection of Python buffers, so that data can be losslessly written to file formats and storage devices @@ -2987,11 +2759,12 @@ def to_buffers( leaving raw bytes, since `dtype` and `shape` can be reconstituted from the #ak.forms.NumpyForm. - The `partition` argument lets you fill the `container` gradually or in parallel. 
- If the `array` is not partitioned, the `partition` argument sets its - partition number (for the container keys, through `key_format`). - If the `array` is partitioned, the `partition` argument is added to each - partition number. + The `partition_start` argument lets you fill the `container` gradually or + in parallel. If the `array` is not partitioned, the `partition_start` + argument sets its partition number (for the container keys, through + `key_format`). If the `array` is partitioned, the first partition is numbered + `partition_start` and as many are filled as ar in `array`. See #ak.partitions + to get the number of partitions in `array`. Here is a simple example: @@ -3083,14 +2856,12 @@ def index_form(index): + ak._util.exception_suffix(__file__) ) + if isinstance(form_key, str): + form_key = lambda **v: form_key.format(**v) # noqa: E731 + if isinstance(key_format, str): key_format = lambda **v: key_format.format(**v) # noqa: E731 - if form_key_format is None: - form_key_format = lambda **v: None # noqa: E731 - elif isinstance(form_key_format, str): - form_key_format = lambda **v: form_key_format.format(**v) # noqa: E731 - num_form_keys = [0] def little_endian(array): @@ -3109,13 +2880,10 @@ def fill(layout, part): ) if isinstance(layout, ak.layout.EmptyArray): - array = numpy.asarray(layout) - key = key_format(node=str(key_index), attribute="data", partition=str(part)) - container[key] = little_endian(array) - - return ak.forms.EmptyForm( - has_identities, parameters, form_key_format(node=str(key_index)) - ) + fk = form_key(id=str(key_index)) + key = key_format(form_key=fk, attribute="data", partition=str(part)) + container[key] = little_endian(numpy.asarray(layout)) + return ak.forms.EmptyForm(has_identities, parameters, fk) elif isinstance( layout, @@ -3125,52 +2893,48 @@ def fill(layout, part): ak.layout.IndexedArray64, ), ): - key = key_format( - node=str(key_index), attribute="index", partition=str(part) - ) + fk = form_key(id=str(key_index), 
layout=layout) + key = key_format(form_key=fk, attribute="index", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.index)) - return ak.forms.IndexedForm( index_form(layout.index), fill(layout.content, part), has_identities, parameters, - form_key_format(node=str(key_index)), + fk, ) elif isinstance( layout, (ak.layout.IndexedOptionArray32, ak.layout.IndexedOptionArray64) ): - key = key_format( - node=str(key_index), attribute="index", partition=str(part) - ) + fk = form_key(id=str(key_index), layout=layout) + key = key_format(form_key=fk, attribute="index", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.index)) - return ak.forms.IndexedOptionForm( index_form(layout.index), fill(layout.content, part), has_identities, parameters, - form_key_format(node=str(key_index)), + fk, ) elif isinstance(layout, ak.layout.ByteMaskedArray): - key = key_format(node=str(key_index), attribute="mask", partition=str(part)) + fk = form_key(id=str(key_index), layout=layout) + key = key_format(form_key=fk, attribute="mask", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.mask)) - return ak.forms.ByteMaskedForm( index_form(layout.mask), fill(layout.content, part), layout.valid_when, has_identities, parameters, - form_key_format(node=str(key_index)), + fk, ) elif isinstance(layout, ak.layout.BitMaskedArray): - key = key_format(node=str(key_index), attribute="mask", partition=str(part)) + fk = form_key(id=str(key_index), layout=layout) + key = key_format(form_key=fk, attribute="mask", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.mask)) - return ak.forms.BitMaskedForm( index_form(layout.mask), fill(layout.content, part), @@ -3178,7 +2942,7 @@ def fill(layout, part): layout.lsb_order, has_identities, parameters, - form_key_format(node=str(key_index)), + fk, ) elif isinstance(layout, ak.layout.UnmaskedArray): @@ -3186,30 +2950,25 @@ def fill(layout, part): fill(layout.content, part), 
has_identities, parameters, - form_key_format(node=str(key_index)), + form_key(id=str(key_index), layout=layout), ) elif isinstance( layout, (ak.layout.ListArray32, ak.layout.ListArrayU32, ak.layout.ListArray64), ): - key = key_format( - node=str(key_index), attribute="starts", partition=str(part) - ) + fk = form_key(id=str(key_index), layout=layout) + key = key_format(form_key=fk, attribute="starts", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.starts)) - - key = key_format( - node=str(key_index), attribute="stops", partition=str(part) - ) + key = key_format(form_key=fk, attribute="stops", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.stops)) - return ak.forms.ListForm( index_form(layout.starts), index_form(layout.stops), fill(layout.content, part), has_identities, parameters, - form_key_format(node=str(key_index)), + fk, ) elif isinstance( @@ -3220,24 +2979,24 @@ def fill(layout, part): ak.layout.ListOffsetArray64, ), ): + fk = form_key(id=str(key_index), layout=layout) key = key_format( - node=str(key_index), attribute="offsets", partition=str(part) + form_key=fk, attribute="offsets", partition=str(part) ) container[key] = little_endian(numpy.asarray(layout.offsets)) - return ak.forms.ListOffsetForm( index_form(layout.offsets), fill(layout.content, part), has_identities, parameters, - form_key_format(node=str(key_index)), + fk, ) elif isinstance(layout, ak.layout.NumpyArray): + fk = form_key(id=str(key_index), layout=layout) + key = key_format(form_key=fk, attribute="data", partition=str(part)) array = numpy.asarray(layout) - key = key_format(node=str(key_index), attribute="data", partition=str(part)) container[key] = little_endian(array) - form = ak.forms.Form.from_numpy(array.dtype) return ak.forms.NumpyForm( layout.shape[1:], @@ -3245,7 +3004,7 @@ def fill(layout, part): form.format, has_identities, parameters, - form_key_format(node=str(key_index)), + fk, ) elif isinstance(layout, ak.layout.RecordArray): 
@@ -3264,7 +3023,7 @@ def fill(layout, part): keys, has_identities, parameters, - form_key_format(node=str(key_index)), + form_key(id=str(key_index), layout=layout), ) elif isinstance(layout, ak.layout.RegularArray): @@ -3273,7 +3032,7 @@ def fill(layout, part): layout.size, has_identities, parameters, - form_key_format(node=str(key_index)), + form_key(id=str(key_index), layout=layout), ) elif isinstance( @@ -3288,21 +3047,18 @@ def fill(layout, part): for x in layout.contents: forms.append(fill(x, part)) - key = key_format(node=str(key_index), attribute="tags", partition=str(part)) + fk = form_key(id=str(key_index), layout=layout) + key = key_format(form_key=fk, attribute="tags", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.tags)) - - key = key_format( - node=str(key_index), attribute="index", partition=str(part) - ) + key = key_format(form_key=fk, attribute="index", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.index)) - return ak.forms.UnionForm( index_form(layout.tags), index_form(layout.index), forms, has_identities, parameters, - form_key_format(node=str(key_index)), + fk, ) elif isinstance(layout, ak.layout.VirtualArray): @@ -3323,7 +3079,7 @@ def fill(layout, part): for part, content in enumerate(layout.partitions): num_form_keys[0] = 0 - f = fill(content, partition + part) + f = fill(content, partition_start + part) if form is None: form = f @@ -3336,7 +3092,7 @@ def fill(layout, part): differs from the first Form: {2}""".format( - partition + part, + partition_start + part, f.tojson(True, False), form.tojson(True, False), ) @@ -3345,7 +3101,7 @@ def fill(layout, part): length.append(len(content)) else: - form = fill(layout, partition) + form = fill(layout, partition_start) length = len(layout) return form, length, container @@ -3357,13 +3113,11 @@ def fill(layout, part): def _form_to_layout( form, container, - partition, - prefix, - sep, - partition_first, - cache=None, - cache_key=None, - 
length=None, + partnum, + key_format, + length, + lazy_cache, + lazy_cache_key, ): global _index_form_to_dtype, _index_form_to_index, _form_to_layout_class @@ -3410,14 +3164,11 @@ def _form_to_layout( identities = None parameters = form.parameters + fk = form.form_key if isinstance(form, ak.forms.BitMaskedForm): raw_mask = ( - container[ - _buffers_key( - form.form_key, "mask", partition, prefix, sep, partition_first, - ) - ] + container[key_format(form_key=fk, attribute="mask", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3428,13 +3179,11 @@ def _form_to_layout( content = _form_to_layout( form.content, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, + partnum, + key_format, len(mask), + lazy_cache, + lazy_cache_key, ) return ak.layout.BitMaskedArray( @@ -3449,11 +3198,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.ByteMaskedForm): raw_mask = ( - container[ - _buffers_key( - form.form_key, "mask", partition, prefix, sep, partition_first, - ) - ] + container[key_format(form_key=fk, attribute="mask", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3464,13 +3209,11 @@ def _form_to_layout( content = _form_to_layout( form.content, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, + partnum, + key_format, len(mask), + lazy_cache, + lazy_cache_key, ) return ak.layout.ByteMaskedArray( @@ -3482,11 +3225,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.IndexedForm): raw_index = ( - container[ - _buffers_key( - form.form_key, "index", partition, prefix, sep, partition_first, - ) - ] + container[key_format(form_key=fk, attribute="index", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3497,13 +3236,11 @@ def _form_to_layout( content = _form_to_layout( form.content, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, - numpy.max(index) + 1, + partnum, + key_format, + 0 if len(index) == 0 else numpy.max(index) + 1, + lazy_cache, + lazy_cache_key, ) return 
_form_to_layout_class[type(form), form.index]( @@ -3512,11 +3249,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.IndexedOptionForm): raw_index = ( - container[ - _buffers_key( - form.form_key, "index", partition, prefix, sep, partition_first, - ) - ] + container[key_format(form_key=fk, attribute="index", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3527,13 +3260,11 @@ def _form_to_layout( content = _form_to_layout( form.content, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, - numpy.max(index) + 1, + partnum, + key_format, + 0 if len(index) == 0 else max(0, numpy.max(index) + 1), + lazy_cache, + lazy_cache_key, ) return _form_to_layout_class[type(form), form.index]( @@ -3542,11 +3273,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.ListForm): raw_starts = ( - container[ - _buffers_key( - form.form_key, "starts", partition, prefix, sep, partition_first, - ) - ] + container[key_format(form_key=fk, attribute="starts", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3554,11 +3281,7 @@ def _form_to_layout( raw_starts.view(_index_form_to_dtype[form.starts]) ) raw_stops = ( - container[ - _buffers_key( - form.form_key, "stops", partition, prefix, sep, partition_first, - ) - ] + container[key_format(form_key=fk, attribute="stops", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3566,16 +3289,17 @@ def _form_to_layout( raw_stops.view(_index_form_to_dtype[form.stops]) ) + array_starts = numpy.asarray(starts) + array_stops = numpy.asarray(stops)[:len(array_starts)] + array_stops = array_stops[array_starts != array_stops] content = _form_to_layout( form.content, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, - stops[-1], + partnum, + key_format, + 0 if len(array_stops) == 0 else numpy.max(array_stops), + lazy_cache, + lazy_cache_key, ) return _form_to_layout_class[type(form), form.starts]( @@ -3584,11 +3308,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.ListOffsetForm): 
raw_offsets = ( - container[ - _buffers_key( - form.form_key, "offsets", partition, prefix, sep, partition_first, - ) - ] + container[key_format(form_key=fk, attribute="offsets", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3599,13 +3319,11 @@ def _form_to_layout( content = _form_to_layout( form.content, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, + partnum, + key_format, offsets[-1], + lazy_cache, + lazy_cache_key, ) return _form_to_layout_class[type(form), form.offsets]( @@ -3614,11 +3332,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.NumpyForm): raw_array = ( - container[ - _buffers_key( - form.form_key, None, partition, prefix, sep, partition_first, - ) - ] + container[key_format(form_key=fk, attribute="data", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3646,13 +3360,11 @@ def _form_to_layout( content = _form_to_layout( content_form, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, + partnum, + key_format, length, + lazy_cache, + lazy_cache_key, ) if minlength is None: minlength = len(content) @@ -3665,16 +3377,17 @@ def _form_to_layout( ) elif isinstance(form, ak.forms.RegularForm): + if length is None: + length = 0 + content = _form_to_layout( form.content, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, + partnum, + key_format, length * form.size, + lazy_cache, + lazy_cache_key, ) return ak.layout.RegularArray( @@ -3683,11 +3396,7 @@ def _form_to_layout( elif isinstance(form, ak.forms.UnionForm): raw_tags = ( - container[ - _buffers_key( - form.form_key, "tags", partition, prefix, sep, partition_first, - ) - ] + container[key_format(form_key=fk, attribute="tags", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3695,11 +3404,7 @@ def _form_to_layout( raw_tags.view(_index_form_to_dtype[form.tags]) ) raw_index = ( - container[ - _buffers_key( - form.form_key, "index", partition, prefix, sep, partition_first, - ) - ] + 
container[key_format(form_key=fk, attribute="index", partition=partnum)] .reshape(-1) .view("u1") ) @@ -3708,19 +3413,17 @@ def _form_to_layout( ) contents = [] - for i, x in enumerate(form.contents): - applicable_indices = numpy.array(index)[numpy.equal(tags, i)] + for i, content_form in enumerate(form.contents): + mine = numpy.array(index)[numpy.equal(tags, i)] contents.append( _form_to_layout( - x, + content_form, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, - numpy.max(applicable_indices) + 1, + partnum, + key_format, + 0 if len(mine) == 0 else numpy.max(mine) + 1, + lazy_cache, + lazy_cache_key, ) ) @@ -3732,13 +3435,11 @@ def _form_to_layout( content = _form_to_layout( form.content, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, + partnum, + key_format, length, + lazy_cache, + lazy_cache_key, ) return ak.layout.UnmaskedArray(content, identities, parameters) @@ -3747,22 +3448,18 @@ def _form_to_layout( args = ( form.form, container, - partition, - prefix, - sep, - partition_first, - cache, - cache_key, + partnum, + key_format, length, + lazy_cache, + lazy_cache_key, ) generator = ak.layout.ArrayGenerator( _form_to_layout, args, form=form.form, length=length, ) - node_cache_key = _buffers_key( - form.form.form_key, "virtual", partition, prefix, sep, partition_first, - ) + node_cache_key = key_format(form_key=form.form.form_key, attribute="virtual", partition=partnum) return ak.layout.VirtualArray( - generator, cache, cache_key + sep + node_cache_key + generator, lazy_cache, "{0}({1})".format(lazy_cache_key, node_cache_key) ) else: @@ -3809,6 +3506,354 @@ def modify(form): return ak.forms.Form.fromjson(json.dumps(form)) +def from_buffers( + form, + length, + container, + partition_start=0, + key_format="part{partition}-{form_key}-{attribute}", + lazy=False, + lazy_cache="new", + lazy_cache_key=None, + highlevel=True, + behavior=None, +): + u""" + Args: + form (#ak.forms.Form or str/dict 
equivalent): The form of the Awkward + Array to reconstitute from named buffers. + length (int or iterable of int): Length of the array to reconstitute as a + non-partitioned array or the lengths (plural) of partitions in a + partitioned array. + container (Mapping, such as dict): The str \u2192 Python buffers that + represent the decomposed Awkward Array. This `container` is only + assumed to have a `__getitem__` method that accepts strings as keys. + partition_start (int): First (or only) partition number to get from the + `container`. + key_format (str or callable): Python format string containing + `"{partition}"`, `"{form_key}"`, and/or `"{attribute}"` or a function + that takes these as keyword arguments and returns a string to use + as keys for buffers in the `container`. The `partition` is a + partition number (non-negative integer, passed as a string), the + `form_key` is a string associated with each node in the Form, and the + `attribute` is a hard-coded string representing the buffer's function + (e.g. `"data"`, `"offsets"`, `"index"`). + lazy (bool): If True, read the array or its partitions on demand (as + #ak.layout.VirtualArray, possibly in #ak.partition.PartitionedArray + if `num_partitions` is not None); if False, read all requested data + immediately. Any RecordArray child nodes will additionally be + read on demand. + lazy_cache (None, "new", or MutableMapping): If lazy, pass this + cache to the VirtualArrays. If "new", a new dict (keep-forever cache) + is created. If None, no cache is used. + lazy_cache_key (None or str): If lazy, pass this cache_key to the + VirtualArrays. If None, a process-unique string is constructed. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.layout.Content subclass. + behavior (bool): Custom #ak.behavior for the output array, if + high-level. 
+ """ + + if isinstance(form, str) or (ak._util.py27 and isinstance(form, ak._util.unicode)): + form = ak.forms.Form.fromjson(form) + elif isinstance(form, dict): + form = ak.forms.Form.fromjson(json.dumps(form)) + + if isinstance(key_format, str): + key_format = lambda **v: key_format.format(**v) # noqa: E731 + + hold_cache = None + if lazy: + form = _wrap_record_with_virtual(form) + + if lazy_cache == "new": + hold_cache = ak._util.MappingProxy({}) + lazy_cache = ak.layout.ArrayCache(hold_cache) + elif lazy_cache is not None and not isinstance( + lazy_cache, ak.layout.ArrayCache + ): + hold_cache = ak._util.MappingProxy.maybe_wrap(lazy_cache) + if not isinstance(hold_cache, MutableMapping): + raise TypeError("lazy_cache must be a MutableMapping") + lazy_cache = ak.layout.ArrayCache(hold_cache) + + if lazy_cache_key is None: + lazy_cache_key = "ak.from_buffers:{0}".format(_from_buffers_key()) + + if length is None or isinstance(length, (numbers.Integral, np.integer)): + if length is None: + print("FIXME: remember to deprecate") + + args = (form, container, str(partition_start), key_format, length) + + if lazy: + generator = ak.layout.ArrayGenerator( + _form_to_layout, + args + (lazy_cache, lazy_cache_key), + form=form, + length=length, + ) + out = ak.layout.VirtualArray(generator, lazy_cache, lazy_cache_key) + + else: + out = _form_to_layout(*(args + (None, None))) + + elif isinstance(length, Iterable): + partitions = [] + offsets = [0] + + for part, partlen in enumerate(length): + partnum = str(partition_start + part) + args = (form, container, partnum, key_format) + + if lazy: + lazy_cache_key_part = "{0}[{1}]".format(lazy_cache_key, partnum) + generator = ak.layout.ArrayGenerator( + _form_to_layout, + args + (partlen, lazy_cache, lazy_cache_key_part), + form=form, + length=length[part], + ) + + partitions.append( + ak.layout.VirtualArray(generator, lazy_cache, lazy_cache_key_part) + ) + offsets.append(offsets[-1] + length[part]) + + else: + 
partitions.append(_form_to_layout(*(args + (partlen, None, None)))) + offsets.append(offsets[-1] + len(partitions[-1])) + + out = ak.partition.IrregularlyPartitionedArray(partitions, offsets[1:]) + + else: + raise TypeError( + "length must be an integer or an iterable of integers, not " + + repr(length) + + ak._util.exception_suffix(__file__) + ) + + if highlevel: + return ak._util.wrap(out, behavior) + else: + return out + + +def to_arrayset( + array, + container=None, + partition=None, + prefix=None, + node_format="node{0}", + partition_format="part{0}", + sep="-", + partition_first=False, +): + u""" + Args: + array: Data to decompose into an arrayset. + container (None or MutableMapping): The str \u2192 NumPy arrays (or + Python buffers) that represent the decomposed Awkward Array. This + `container` is only assumed to have a `__setitem__` method that + accepts strings as keys. + partition (None or non-negative int): If None and `array` is not + partitioned, keys written to the container have no reference to + partitioning; if an integer and `array` is not partitioned, keys + use this as their partition number; if `array` is partitioned, the + `partition` argument must be None and keys are written with the + array's own internal partition numbers. + prefix (None or str): If None, keys only contain node and partition + information; if a string, keys are all prepended by `prefix + sep`. + node_format (str or callable): Python format string or function + (returning str) of the node part of keys written to the container + and the `form_key` values in the output Form. Its only argument + (`{0}` in the format string) is the node number, unique within the + `array`. + partition_format (str or callable): Python format string or function + (returning str) of the partition part of keys written to the + container (if any). Its only argument (`{0}` in the format string) + is the partition number. + sep (str): Separates the prefix, node part, array attribute (e.g. 
+ `"starts"`, `"stops"`, `"mask"`), and partition part of the + keys written to the container. + partition_first (bool): If True, the partition part appears immediately + after the prefix (if any); if False, the partition part appears + at the end of the keys. This can be relevant if the `container` + is sorted or lookup performance depends on alphabetical order. + + **Deprecated:** this will be removed in `awkward>=1.1.0` after January 1, + 2021. Use #ak.to_buffers instead: the arguments and return values have changed. + + Decomposes an Awkward Array into a Form and a collection of arrays, so + that data can be losslessly written to file formats and storage devices + that only understand named arrays (or binary blobs). + + This function returns a 3-tuple: + + (form, container, num_partitions) + + where the `form` is a #ak.forms.Form (which can be converted to JSON + with `tojson`), the `container` is either the MutableMapping you passed in + or a new dict containing the NumPy arrays, and `num_partitions` is None + if `array` was not partitioned or the number of partitions if it was. + + These are also the first three arguments of #ak.from_arrayset, so a full + round-trip is + + >>> reconstituted = ak.from_arrayset(*ak.to_arrayset(original)) + + The `container` argument lets you specify your own MutableMapping, which + might be an interface to some storage format or device (e.g. h5py). It's + okay if the `container` drops NumPy's `dtype` and `shape` information, + leaving raw bytes, since `dtype` and `shape` can be reconstituted from + the #ak.forms.NumpyForm. + + The `partition` argument lets you fill the `container` one partition at a + time using unpartitioned arrays. + + The rest of the arguments determine the format of the keys written to the + `container` (which might be restrictive if it represents a storage device). 
+ + Here is a simple example: + + >>> original = ak.Array([[1, 2, 3], [], [4, 5]]) + >>> form, container, num_partitions = ak.to_arrayset(original) + >>> form + { + "class": "ListOffsetArray64", + "offsets": "i64", + "content": { + "class": "NumpyArray", + "itemsize": 8, + "format": "l", + "primitive": "int64", + "form_key": "node1" + }, + "form_key": "node0" + } + >>> container + {'node0-offsets': array([0, 3, 3, 5], dtype=int64), + 'node1': array([1, 2, 3, 4, 5])} + >>> print(num_partitions) + None + + which may be read back with + + >>> ak.from_arrayset(form, container) + + + (the third argument of #ak.from_arrayset defaults to None). + + Here is an example of building up a partitioned array: + + >>> container = {} + >>> form, _, _ = ak.to_arrayset(ak.Array([[1, 2, 3], [], [4, 5]]), container, 0) + >>> form, _, _ = ak.to_arrayset(ak.Array([[6, 7, 8, 9]]), container, 1) + >>> form, _, _ = ak.to_arrayset(ak.Array([[], [], []]), container, 2) + >>> form, _, _ = ak.to_arrayset(ak.Array([[10]]), container, 3) + >>> form + { + "class": "ListOffsetArray64", + "offsets": "i64", + "content": { + "class": "NumpyArray", + "itemsize": 8, + "format": "l", + "primitive": "int64", + "form_key": "node1" + }, + "form_key": "node0" + } + >>> container + {'node0-offsets-part0': array([0, 3, 3, 5], dtype=int64), + 'node1-part0': array([1, 2, 3, 4, 5]), + 'node0-offsets-part1': array([0, 4], dtype=int64), + 'node1-part1': array([6, 7, 8, 9]), + 'node0-offsets-part2': array([0, 0, 0, 0], dtype=int64), + 'node1-part2': array([], dtype=float64), + 'node0-offsets-part3': array([0, 1], dtype=int64), + 'node1-part3': array([10])} + + The object returned by #ak.from_arrayset is now a partitioned array: + + >>> ak.from_arrayset(form, container, 4) + + >>> ak.partitions(ak.from_arrayset(form, container, 4)) + [3, 1, 3, 1] + + See also #ak.from_arrayset. 
+ """ + + print("FIXME: remember to deprecate") + + layout = to_layout(array, allow_record=False, allow_other=False) + + if isinstance(layout, ak.partition.PartitionedArray): + show_partition = True + if partition is not None: + raise ValueError( + "array is partitioned; an explicit 'partition' should not be " + "assigned" + ak._util.exception_suffix(__file__) + ) + else: + if partition is None: + show_partition = False + else: + show_partition = True + + if partition is None: + partition_start = 0 + else: + partition_start = partition + + def form_key(**v): + return "node{id}".format(**v) + + def key_format(**v): + v["sep"] = sep + if prefix is None: + v["prefix"] = "" + else: + v["prefix"] = prefix + sep + + if not show_partition: + if v["attribute"] == "data": + return "{prefix}{form_key}".format(**v) + else: + return "{prefix}{form_key}{sep}{attribute}".format(**v) + + elif partition_first: + if v["attribute"] == "data": + return "{prefix}part{partition}{sep}{form_key}".format(**v) + else: + return "{prefix}part{partition}{sep}{form_key}{sep}{attribute}".format( + **v + ) + + else: + if v["attribute"] == "data": + return "{prefix}{form_key}{sep}part{partition}".format(**v) + else: + return "{prefix}{form_key}{sep}{attribute}{sep}part{partition}".format( + **v + ) + + form, length, container = to_buffers( + layout, + container=container, + partition_start=partition_start, + form_key=form_key, + key_format=key_format, + ) + + if isinstance(length, (numbers.Integral, np.integer)): + num_partitions = None + else: + num_partitions = len(length) + + return form, container, num_partitions + + def from_arrayset( form, container, @@ -3872,6 +3917,9 @@ def from_arrayset( behavior (bool): Custom #ak.behavior for the output array, if high-level. + **Deprecated:** this will be removed in `awkward>=1.1.0` after January 1, + 2021. Use #ak.from_buffers instead: the arguments have changed. 
+ Reconstructs an Awkward Array from a Form and a collection of arrays, so that data can be losslessly read from file formats and storage devices that only understand named arrays (or binary blobs). @@ -3903,15 +3951,7 @@ def from_arrayset( See #ak.to_arrayset for examples. """ - if isinstance(form, str) or (ak._util.py27 and isinstance(form, ak._util.unicode)): - form = ak.forms.Form.fromjson(form) - elif isinstance(form, dict): - form = ak.forms.Form.fromjson(json.dumps(form)) - - if prefix is None: - prefix = "" - else: - prefix = prefix + sep + print("FIXME: remember to deprecate") if isinstance(partition_format, str) or ( ak._util.py27 and isinstance(partition_format, ak._util.unicode) @@ -3921,102 +3961,90 @@ def from_arrayset( def partition_format(x): return tmp2.format(x) - hold_cache = None - if lazy: - form = _wrap_record_with_virtual(form) - - if lazy_cache == "new": - hold_cache = ak._util.MappingProxy({}) - lazy_cache = ak.layout.ArrayCache(hold_cache) - elif lazy_cache == "attach": - raise TypeError("lazy_cache must be a MutableMapping") - hold_cache = ak._util.MappingProxy({}) - lazy_cache = ak.layout.ArrayCache(hold_cache) - elif lazy_cache is not None and not isinstance( - lazy_cache, ak.layout.ArrayCache - ): - hold_cache = ak._util.MappingProxy.maybe_wrap(lazy_cache) - if not isinstance(hold_cache, MutableMapping): - raise TypeError("lazy_cache must be a MutableMapping") - lazy_cache = ak.layout.ArrayCache(hold_cache) - - if lazy_cache_key is None: - lazy_cache_key = "ak.from_arrayset:{0}".format(_from_buffers_key()) - if num_partitions is None: - args = (form, container, None, prefix, sep, partition_first) + show_partition = False - if lazy: - if not isinstance(lazy_lengths, numbers.Integral): + if lazy_lengths is None: + if lazy: raise TypeError( "for lazy=True and num_partitions=None, lazy_lengths " "must be an integer, not " + repr(lazy_lengths) + ak._util.exception_suffix(__file__) ) + length = None - generator = ak.layout.ArrayGenerator( - 
_form_to_layout, - args + (lazy_cache, lazy_cache_key, lazy_lengths), - form=form, - length=lazy_lengths, - ) - - out = ak.layout.VirtualArray(generator, lazy_cache, lazy_cache_key) + elif isinstance(lazy_lengths, (numbers.Integral, np.integer)): + length = lazy_lengths else: - out = _form_to_layout(*args) + raise TypeError( + "for num_partitions=None, lazy_lengths " + "must be None or an integer, not " + + repr(lazy_lengths) + + ak._util.exception_suffix(__file__) + ) else: - if lazy: - if isinstance(lazy_lengths, numbers.Integral): - lazy_lengths = [lazy_lengths] * num_partitions - elif ( - isinstance(lazy_lengths, Iterable) - and len(lazy_lengths) == num_partitions - and all(isinstance(x, numbers.Integral) for x in lazy_lengths) - ): - pass - else: + show_partition = True + + if lazy_lengths is None: + if lazy: raise TypeError( - "for lazy=True, lazy_lengths must be an integer or " - "iterable of 'num_partitions' integers, not " + "for lazy=True and isinstance(num_partitions, int), lazy_lengths " + "must be an iterable of 'num_partitions' integers, not " + repr(lazy_lengths) + ak._util.exception_suffix(__file__) ) + length = [None] * num_partitions - partitions = [] - offsets = [0] + elif isinstance(lazy_lengths, (numbers.Integral, np.integer)): + length = [lazy_lengths] * num_partitions - for part in range(num_partitions): - p = partition_format(part) - args = (form, container, p, prefix, sep, partition_first) + else: + length = lazy_lengths - if lazy: - cache_key = "{0}[{1}]".format(lazy_cache_key, part) + def key_format(**v): + v["sep"] = sep + if prefix is None: + v["prefix"] = "" + else: + v["prefix"] = prefix + sep - generator = ak.layout.ArrayGenerator( - _form_to_layout, - args + (lazy_cache, cache_key, lazy_lengths[part]), - form=form, - length=lazy_lengths[part], - ) + if not show_partition: + if v["attribute"] == "data": + return "{prefix}{form_key}".format(**v) + else: + return "{prefix}{form_key}{sep}{attribute}".format(**v) - partitions.append( - 
ak.layout.VirtualArray(generator, lazy_cache, cache_key) + elif partition_first: + if v["attribute"] == "data": + return "{prefix}part{partition}{sep}{form_key}".format(**v) + else: + return "{prefix}part{partition}{sep}{form_key}{sep}{attribute}".format( + **v ) - offsets.append(offsets[-1] + lazy_lengths[part]) + else: + if v["attribute"] == "data": + return "{prefix}{form_key}{sep}part{partition}".format(**v) else: - partitions.append(_form_to_layout(*args)) - offsets.append(offsets[-1] + len(partitions[-1])) - - out = ak.partition.IrregularlyPartitionedArray(partitions, offsets[1:]) + return "{prefix}{form_key}{sep}{attribute}{sep}part{partition}".format( + **v + ) - if highlevel: - return ak._util.wrap(out, behavior) - else: - return out + return from_buffers( + form, + length, + container, + partition_start=0, + key_format=key_format, + lazy=lazy, + lazy_cache=lazy_cache, + lazy_cache_key=lazy_cache_key, + highlevel=highlevel, + behavior=behavior, + ) def to_pandas( diff --git a/tests/test_0348-form-keys.py b/tests/test_0348-form-keys.py index e14694f9e2..7334676452 100644 --- a/tests/test_0348-form-keys.py +++ b/tests/test_0348-form-keys.py @@ -206,7 +206,6 @@ def test_record(): ).tolist() == {"x": 2.2, "y": [1, 2]} -@pytest.mark.skip(reason="FIXME: arrayset has to be given lengths; it's required") def test_regulararray(): content = ak.Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]).layout regulararray = ak.layout.RegularArray(content, 3, zeros_length=0) diff --git a/tests/test_0384-lazy-arrayset.py b/tests/test_0384-lazy-arrayset.py index 43991c8ec2..077e6b6e7b 100644 --- a/tests/test_0384-lazy-arrayset.py +++ b/tests/test_0384-lazy-arrayset.py @@ -89,7 +89,7 @@ def test_lazy_arrayset(): assert ak.to_list(ak.num(out.listcollection)) == [3, 3, 3] assert set(canary.ops) == {("get", "kitty-node1-offsets")} - assert set(cache) == {"hello", "hello-kitty-node1-virtual"} + assert set(cache) == {"hello", "hello(kitty-node1-virtual)"} canary.ops = [] cache.clear() @@ 
-107,16 +107,16 @@ def test_lazy_arrayset(): } assert set(cache) == { "hello", - "hello-kitty-node11-virtual", - "hello-kitty-node13-virtual", - "hello-kitty-node16-virtual", + "hello(kitty-node11-virtual)", + "hello(kitty-node13-virtual)", + "hello(kitty-node16-virtual)", } canary.ops = [] cache.clear() assert ak.to_list(out.masked) == [None, 4, 4] assert set(canary.ops) == {("get", "kitty-node17-index"), ("get", "kitty-node18")} - assert set(cache) == {"hello", "hello-kitty-node17-virtual"} + assert set(cache) == {"hello", "hello(kitty-node17-virtual)"} canary.ops = [] cache.clear() From 92495f7fdbf41472f7d369d43adddf21021b2cf7 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 10 Dec 2020 21:45:38 -0600 Subject: [PATCH 4/8] All the deprecation messages are in place. --- src/awkward/operations/convert.py | 117 ++++++++++++++++++++++++------ tests/test_0348-form-keys.py | 32 ++++++++ tests/test_0384-lazy-arrayset.py | 4 + 3 files changed, 129 insertions(+), 24 deletions(-) diff --git a/src/awkward/operations/convert.py b/src/awkward/operations/convert.py index 821d694fea..86b2ca6c0d 100644 --- a/src/awkward/operations/convert.py +++ b/src/awkward/operations/convert.py @@ -2734,7 +2734,7 @@ def to_buffers( `attribute` is a hard-coded string representing the buffer's function (e.g. `"data"`, `"offsets"`, `"index"`). - Decomposes an Awkward Array into a Form and a collection of Python buffers, + Decomposes an Awkward Array into a Form and a collection of memory buffers, so that data can be losslessly written to file formats and storage devices that only map names to binary blobs (such as a filesystem directory). 
@@ -2799,9 +2799,13 @@ def to_buffers( >>> container = {} >>> lengths = [] >>> form, length, _ = ak.to_buffers(ak.Array([[1, 2, 3], [], [4, 5]]), container, 0) + >>> lengths.append(length) >>> form, length, _ = ak.to_buffers(ak.Array([[6, 7, 8, 9]]), container, 1) + >>> lengths.append(length) >>> form, length, _ = ak.to_buffers(ak.Array([[], [], []]), container, 2) + >>> lengths.append(length) >>> form, length, _ = ak.to_buffers(ak.Array([[10]]), container, 3) + >>> lengths.append(length) >>> form { "class": "ListOffsetArray64", @@ -2815,6 +2819,8 @@ def to_buffers( }, "form_key": "node0" } + >>> lengths + [3, 1, 3, 1] >>> container {'part0-node0-offsets': array([0, 3, 3, 5], dtype=int64), 'part0-node1-data': array([1, 2, 3, 4, 5]), @@ -2857,10 +2863,24 @@ def index_form(index): ) if isinstance(form_key, str): - form_key = lambda **v: form_key.format(**v) # noqa: E731 + + def generate_form_key(form_key): + def fk(**v): + return form_key.format(**v) + + return fk + + form_key = generate_form_key(form_key) if isinstance(key_format, str): - key_format = lambda **v: key_format.format(**v) # noqa: E731 + + def generate_key_format(key_format): + def kf(**v): + return key_format.format(**v) + + return kf + + key_format = generate_key_format(key_format) num_form_keys = [0] @@ -2980,9 +3000,7 @@ def fill(layout, part): ), ): fk = form_key(id=str(key_index), layout=layout) - key = key_format( - form_key=fk, attribute="offsets", partition=str(part) - ) + key = key_format(form_key=fk, attribute="offsets", partition=str(part)) container[key] = little_endian(numpy.asarray(layout.offsets)) return ak.forms.ListOffsetForm( index_form(layout.offsets), @@ -3111,13 +3129,7 @@ def fill(layout, part): def _form_to_layout( - form, - container, - partnum, - key_format, - length, - lazy_cache, - lazy_cache_key, + form, container, partnum, key_format, length, lazy_cache, lazy_cache_key, ): global _index_form_to_dtype, _index_form_to_index, _form_to_layout_class @@ -3290,7 +3302,7 @@ def 
_form_to_layout( ) array_starts = numpy.asarray(starts) - array_stops = numpy.asarray(stops)[:len(array_starts)] + array_stops = numpy.asarray(stops)[: len(array_starts)] array_stops = array_stops[array_starts != array_stops] content = _form_to_layout( form.content, @@ -3372,8 +3384,10 @@ def _form_to_layout( minlength = min(minlength, len(content)) contents.append(content) + if length is None: + length = minlength return ak.layout.RecordArray( - contents, None if form.istuple else keys, minlength, identities, parameters, + contents, None if form.istuple else keys, length, identities, parameters, ) elif isinstance(form, ak.forms.RegularForm): @@ -3457,7 +3471,9 @@ def _form_to_layout( generator = ak.layout.ArrayGenerator( _form_to_layout, args, form=form.form, length=length, ) - node_cache_key = key_format(form_key=form.form.form_key, attribute="virtual", partition=partnum) + node_cache_key = key_format( + form_key=form.form.form_key, attribute="virtual", partition=partnum + ) return ak.layout.VirtualArray( generator, lazy_cache, "{0}({1})".format(lazy_cache_key, node_cache_key) ) @@ -3552,6 +3568,30 @@ def from_buffers( a low-level #ak.layout.Content subclass. behavior (bool): Custom #ak.behavior for the output array, if high-level. + + Reconstitutes an Awkward Array from a Form, length, and a collection of memory + buffers, so that data can be losslessly read from file formats and storage + devices that only map names to binary blobs (such as a filesystem directory). + + The first three arguments of this function are the return values of + #ak.to_buffers, so a full round-trip is + + >>> reconstituted = ak.from_buffers(*ak.to_buffers(original)) + + The `container` argument lets you specify your own Mapping, which might be + an interface to some storage format or device (e.g. h5py). It's okay if + the `container` dropped NumPy's `dtype` and `shape` information, leaving + raw bytes, since `dtype` and `shape` can be reconstituted from the + #ak.forms.NumpyForm. 
+ + The `key_format` should be the same as the one used in #ak.to_buffers. + + The arguments that begin with `lazy_` are only needed if `lazy` is True. + The `lazy_cache` and `lazy_cache_key` determine how the array or its + partitions are cached after being read from the `container` (in a no-eviction + dict attached to the output #ak.Array as `cache` if not specified). + + See #ak.to_buffers for examples. """ if isinstance(form, str) or (ak._util.py27 and isinstance(form, ak._util.unicode)): @@ -3560,7 +3600,14 @@ def from_buffers( form = ak.forms.Form.fromjson(json.dumps(form)) if isinstance(key_format, str): - key_format = lambda **v: key_format.format(**v) # noqa: E731 + + def generate_key_format(key_format): + def kf(**v): + return key_format.format(**v) + + return kf + + key_format = generate_key_format(key_format) hold_cache = None if lazy: @@ -3582,7 +3629,14 @@ def from_buffers( if length is None or isinstance(length, (numbers.Integral, np.integer)): if length is None: - print("FIXME: remember to deprecate") + ak._util.deprecate( + TypeError( + "length must be an integer or an iterable of integers" + + ak._util.exception_suffix(__file__) + ), + "1.1.0", + "January 1, 2021", + ) args = (form, container, str(partition_start), key_format, length) @@ -3681,8 +3735,9 @@ def to_arrayset( at the end of the keys. This can be relevant if the `container` is sorted or lookup performance depends on alphabetical order. - **Deprecated:** this will be removed in `awkward>=1.1.0` after January 1, - 2021. Use #ak.to_buffers instead: the arguments and return values have changed. + **Deprecated:** This will be removed in `awkward>=1.1.0` (target date: + January 1, 2021). Use #ak.to_buffers instead: the arguments and return + values have changed. Decomposes an Awkward Array into a Form and a collection of arrays, so that data can be losslessly written to file formats and storage devices @@ -3784,7 +3839,14 @@ def to_arrayset( See also #ak.from_arrayset. 
""" - print("FIXME: remember to deprecate") + ak._util.deprecate( + TypeError( + "ak.to_arrayset is deprecated; use ak.to_buffers instead" + + ak._util.exception_suffix(__file__) + ), + "1.1.0", + "January 1, 2021", + ) layout = to_layout(array, allow_record=False, allow_other=False) @@ -3917,8 +3979,8 @@ def from_arrayset( behavior (bool): Custom #ak.behavior for the output array, if high-level. - **Deprecated:** this will be removed in `awkward>=1.1.0` after January 1, - 2021. Use #ak.from_buffers instead: the arguments have changed. + **Deprecated:** This will be removed in `awkward>=1.1.0` (target date: + January 1, 2021). Use #ak.from_buffers instead: the arguments have changed. Reconstructs an Awkward Array from a Form and a collection of arrays, so that data can be losslessly read from file formats and storage devices that @@ -3951,7 +4013,14 @@ def from_arrayset( See #ak.to_arrayset for examples. """ - print("FIXME: remember to deprecate") + ak._util.deprecate( + TypeError( + "ak.from_arrayset is deprecated; use ak.from_buffers instead" + + ak._util.exception_suffix(__file__) + ), + "1.1.0", + "January 1, 2021", + ) if isinstance(partition_format, str) or ( ak._util.py27 and isinstance(partition_format, ak._util.unicode) diff --git a/tests/test_0348-form-keys.py b/tests/test_0348-form-keys.py index 7334676452..13fe5f5f30 100644 --- a/tests/test_0348-form-keys.py +++ b/tests/test_0348-form-keys.py @@ -15,6 +15,8 @@ def test_numpyarray(): + ak.deprecations_as_errors = False + assert ak.from_arrayset(*ak.to_arrayset([1, 2, 3, 4, 5])).tolist() == [ 1, 2, @@ -32,6 +34,8 @@ def test_numpyarray(): def test_listoffsetarray(): + ak.deprecations_as_errors = False + assert ak.from_arrayset(*ak.to_arrayset([[1, 2, 3], [], [4, 5]])).tolist() == [ [1, 2, 3], [], @@ -49,6 +53,8 @@ def test_listoffsetarray(): def test_listarray(): + ak.deprecations_as_errors = False + listoffsetarray = ak.Array([[1, 2, 3], [], [4, 5]]).layout listarray = ak.layout.ListArray64( 
listoffsetarray.starts, listoffsetarray.stops, listoffsetarray.content @@ -66,6 +72,8 @@ def test_listarray(): def test_indexedoptionarray(): + ak.deprecations_as_errors = False + assert ak.from_arrayset(*ak.to_arrayset([1, 2, 3, None, None, 5])).tolist() == [ 1, 2, @@ -80,6 +88,8 @@ def test_indexedoptionarray(): def test_indexedarray(): + ak.deprecations_as_errors = False + content = ak.Array([0.0, 1.1, 2.2, 3.3, 4.4]).layout index = ak.layout.Index64(np.array([3, 1, 1, 4, 2], dtype=np.int64)) indexedarray = ak.layout.IndexedArray64(index, content) @@ -100,6 +110,8 @@ def test_indexedarray(): def test_emptyarray(): + ak.deprecations_as_errors = False + assert ak.from_arrayset(*ak.to_arrayset([])).tolist() == [] assert ak.from_arrayset(*ak.to_arrayset([[], [], []])).tolist() == [[], [], []] @@ -112,6 +124,8 @@ def test_emptyarray(): def test_bytemaskedarray(): + ak.deprecations_as_errors = False + content = ak.Array([0.0, 1.1, 2.2, 3.3, 4.4]).layout mask = ak.layout.Index8(np.array([False, True, True, False, False], dtype=np.int8)) bytemaskedarray = ak.layout.ByteMaskedArray(mask, content, True) @@ -132,6 +146,8 @@ def test_bytemaskedarray(): def test_bitmaskedarray(): + ak.deprecations_as_errors = False + content = ak.Array([0.0, 1.1, 2.2, 3.3, 4.4]).layout mask = ak.layout.IndexU8( np.packbits(np.array([False, True, True, False, False], dtype=np.int8)) @@ -154,6 +170,8 @@ def test_bitmaskedarray(): def test_recordarray(): + ak.deprecations_as_errors = False + assert ak.from_arrayset( *ak.to_arrayset([(1.1, [1]), (2.2, [1, 2]), (3.3, [1, 2, 3])]) ).tolist() == [(1.1, [1]), (2.2, [1, 2]), (3.3, [1, 2, 3])] @@ -189,6 +207,8 @@ def test_recordarray(): def test_record(): + ak.deprecations_as_errors = False + assert pickle.loads( pickle.dumps(ak.Record({"x": 2.2, "y": [1, 2]}), -1) ).tolist() == {"x": 2.2, "y": [1, 2]} @@ -207,6 +227,8 @@ def test_record(): def test_regulararray(): + ak.deprecations_as_errors = False + content = ak.Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12]).layout regulararray = ak.layout.RegularArray(content, 3, zeros_length=0) assert ak.from_arrayset(*ak.to_arrayset(regulararray)).tolist() == [ @@ -224,6 +246,8 @@ def test_regulararray(): def test_unionarray(): + ak.deprecations_as_errors = False + assert ak.from_arrayset(*ak.to_arrayset([[1, 2, 3], [], 4, 5])).tolist() == [ [1, 2, 3], [], @@ -239,6 +263,8 @@ def test_unionarray(): def test_unmaskedarray(): + ak.deprecations_as_errors = False + content = ak.Array([1, 2, 3, 4, 5]).layout unmaskedarray = ak.layout.UnmaskedArray(content) assert ak.from_arrayset(*ak.to_arrayset(unmaskedarray)).tolist() == [1, 2, 3, 4, 5] @@ -252,6 +278,8 @@ def test_unmaskedarray(): def test_partitioned(): + ak.deprecations_as_errors = False + array = ak.repartition(ak.Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), 3) form, container, num_partitions = ak.to_arrayset(array, partition_first=True) @@ -298,6 +326,8 @@ def test_partitioned(): def test_lazy(): + ak.deprecations_as_errors = False + array = ak.Array([1, 2, 3, 4, 5]) form, container, num_partitions = ak.to_arrayset(array) @@ -308,6 +338,8 @@ def test_lazy(): def test_lazy_partitioned(): + ak.deprecations_as_errors = False + array = ak.repartition(ak.Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), 3) form, container, num_partitions = ak.to_arrayset(array) assert num_partitions == 4 diff --git a/tests/test_0384-lazy-arrayset.py b/tests/test_0384-lazy-arrayset.py index 077e6b6e7b..beefa51456 100644 --- a/tests/test_0384-lazy-arrayset.py +++ b/tests/test_0384-lazy-arrayset.py @@ -22,6 +22,8 @@ def __setitem__(self, key, value): def test_lazy_arrayset(): + ak.deprecations_as_errors = False + array = ak.from_json( """ [ @@ -122,6 +124,8 @@ def test_lazy_arrayset(): def test_longer_than_expected(): + ak.deprecations_as_errors = False + array = ak.Array( ak.layout.ListOffsetArray64( ak.layout.Index64([0, 2, 4]), From acc21376edbc4b24ded265df7bc4557433f94bd4 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 11 Dec 2020 09:46:30 -0600 
Subject: [PATCH 5/8] Pickle uses the new to_buffers/from_buffers, but old pickle files can still be read. --- src/awkward/_util.py | 28 +++++++++++++++++++++++++ src/awkward/highlevel.py | 34 +++++++++++++++++++++++-------- src/awkward/operations/convert.py | 8 -------- 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/src/awkward/_util.py b/src/awkward/_util.py index bf1ec4d56d..3c1ca31b27 100644 --- a/src/awkward/_util.py +++ b/src/awkward/_util.py @@ -1697,3 +1697,31 @@ def union_to_record(unionarray, anonymous): ) return ak.layout.RecordArray(all_fields, all_names, len(unionarray)) + + +def adjust_old_pickle(form, container, num_partitions, behavior): + def key_format(**v): + if num_partitions is None: + if v["attribute"] == "data": + return "{form_key}".format(**v) + else: + return "{form_key}-{attribute}".format(**v) + + else: + if v["attribute"] == "data": + return "{form_key}-part{partition}".format(**v) + else: + return "{form_key}-{attribute}-part{partition}".format(**v) + + return ak.operations.convert.from_buffers( + form, + None, + container, + partition_start=0, + key_format=key_format, + lazy=False, + lazy_cache="new", + lazy_cache_key=None, + highlevel=False, + behavior=behavior, + ) diff --git a/src/awkward/highlevel.py b/src/awkward/highlevel.py index a75a245108..5996ad6505 100644 --- a/src/awkward/highlevel.py +++ b/src/awkward/highlevel.py @@ -1386,16 +1386,24 @@ def numba_type(self): return numba.typeof(self._numbaview) def __getstate__(self): - form, container, num_partitions = ak.to_arrayset(self) + form, length, container = ak.operations.convert.to_buffers(self._layout) if self._behavior is ak.behavior: behavior = None else: behavior = self._behavior - return form, container, num_partitions, behavior + return form, length, container, behavior def __setstate__(self, state): - form, container, num_partitions, behavior = state - layout = ak.from_arrayset(form, container, num_partitions, highlevel=False) + if isinstance(state[1], 
dict): + form, container, num_partitions, behavior = state + layout = ak._util.adjust_old_pickle( + form, container, num_partitions, behavior + ) + else: + form, length, container, behavior = state + layout = ak.operations.convert.from_buffers( + form, length, container, highlevel=False, behavior=behavior + ) if self.__class__ is Array: self.__class__ = ak._util.arrayclass(layout, behavior) self.layout = layout @@ -1975,17 +1983,25 @@ def numba_type(self): return numba.typeof(self._numbaview) def __getstate__(self): - form, container, num_partitions = ak.to_arrayset(self._layout.array) + form, length, container = ak.operations.convert.to_buffers(self._layout.array) if self._behavior is ak.behavior: behavior = None else: behavior = self._behavior - return form, container, num_partitions, behavior, self._layout.at + return form, length, container, behavior, self._layout.at def __setstate__(self, state): - form, container, num_partitions, behavior, at = state - array = ak.from_arrayset(form, container, num_partitions, highlevel=False) - layout = ak.layout.Record(array, at) + if isinstance(state[1], dict): + form, container, num_partitions, behavior, at = state + layout = ak._util.adjust_old_pickle( + form, container, num_partitions, behavior + ) + else: + form, length, container, behavior, at = state + layout = ak.operations.convert.from_buffers( + form, length, container, highlevel=False, behavior=behavior + ) + layout = ak.layout.Record(layout, at) if self.__class__ is Record: self.__class__ = ak._util.recordclass(layout, behavior) self.layout = layout diff --git a/src/awkward/operations/convert.py b/src/awkward/operations/convert.py index 86b2ca6c0d..0f355051fa 100644 --- a/src/awkward/operations/convert.py +++ b/src/awkward/operations/convert.py @@ -4022,14 +4022,6 @@ def from_arrayset( "January 1, 2021", ) - if isinstance(partition_format, str) or ( - ak._util.py27 and isinstance(partition_format, ak._util.unicode) - ): - tmp2 = partition_format - - def 
partition_format(x): - return tmp2.format(x) - if num_partitions is None: show_partition = False From 8cc01334d206254891d0d8141e37ba0d03675668 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 11 Dec 2020 13:33:15 -0600 Subject: [PATCH 6/8] Eliminated all warnings, references to 'arrayset' from the tests. --- tests/test_0348-form-keys.py | 144 ++++++++++++++++--------------- tests/test_0384-lazy-arrayset.py | 24 +++--- 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/tests/test_0348-form-keys.py b/tests/test_0348-form-keys.py index 13fe5f5f30..79a2570cae 100644 --- a/tests/test_0348-form-keys.py +++ b/tests/test_0348-form-keys.py @@ -15,9 +15,7 @@ def test_numpyarray(): - ak.deprecations_as_errors = False - - assert ak.from_arrayset(*ak.to_arrayset([1, 2, 3, 4, 5])).tolist() == [ + assert ak.from_buffers(*ak.to_buffers([1, 2, 3, 4, 5])).tolist() == [ 1, 2, 3, @@ -34,18 +32,16 @@ def test_numpyarray(): def test_listoffsetarray(): - ak.deprecations_as_errors = False - - assert ak.from_arrayset(*ak.to_arrayset([[1, 2, 3], [], [4, 5]])).tolist() == [ + assert ak.from_buffers(*ak.to_buffers([[1, 2, 3], [], [4, 5]])).tolist() == [ [1, 2, 3], [], [4, 5], ] - assert ak.from_arrayset( - *ak.to_arrayset(["one", "two", "three", "four", "five"]) + assert ak.from_buffers( + *ak.to_buffers(["one", "two", "three", "four", "five"]) ).tolist() == ["one", "two", "three", "four", "five"] - assert ak.from_arrayset( - *ak.to_arrayset([["one", "two", "three"], [], ["four", "five"]]) + assert ak.from_buffers( + *ak.to_buffers([["one", "two", "three"], [], ["four", "five"]]) ).tolist() == [["one", "two", "three"], [], ["four", "five"]] assert pickle.loads( pickle.dumps(ak.Array([[1, 2, 3], [], [4, 5]]), -1) @@ -53,13 +49,11 @@ def test_listoffsetarray(): def test_listarray(): - ak.deprecations_as_errors = False - listoffsetarray = ak.Array([[1, 2, 3], [], [4, 5]]).layout listarray = ak.layout.ListArray64( listoffsetarray.starts, listoffsetarray.stops, 
listoffsetarray.content ) - assert ak.from_arrayset(*ak.to_arrayset(listarray)).tolist() == [ + assert ak.from_buffers(*ak.to_buffers(listarray)).tolist() == [ [1, 2, 3], [], [4, 5], @@ -72,9 +66,7 @@ def test_listarray(): def test_indexedoptionarray(): - ak.deprecations_as_errors = False - - assert ak.from_arrayset(*ak.to_arrayset([1, 2, 3, None, None, 5])).tolist() == [ + assert ak.from_buffers(*ak.to_buffers([1, 2, 3, None, None, 5])).tolist() == [ 1, 2, 3, @@ -88,12 +80,10 @@ def test_indexedoptionarray(): def test_indexedarray(): - ak.deprecations_as_errors = False - content = ak.Array([0.0, 1.1, 2.2, 3.3, 4.4]).layout index = ak.layout.Index64(np.array([3, 1, 1, 4, 2], dtype=np.int64)) indexedarray = ak.layout.IndexedArray64(index, content) - assert ak.from_arrayset(*ak.to_arrayset(indexedarray)).tolist() == [ + assert ak.from_buffers(*ak.to_buffers(indexedarray)).tolist() == [ 3.3, 1.1, 1.1, @@ -110,10 +100,8 @@ def test_indexedarray(): def test_emptyarray(): - ak.deprecations_as_errors = False - - assert ak.from_arrayset(*ak.to_arrayset([])).tolist() == [] - assert ak.from_arrayset(*ak.to_arrayset([[], [], []])).tolist() == [[], [], []] + assert ak.from_buffers(*ak.to_buffers([])).tolist() == [] + assert ak.from_buffers(*ak.to_buffers([[], [], []])).tolist() == [[], [], []] assert pickle.loads(pickle.dumps(ak.Array([]), -1)).tolist() == [] assert pickle.loads(pickle.dumps(ak.Array([[], [], []]), -1)).tolist() == [ @@ -124,12 +112,10 @@ def test_emptyarray(): def test_bytemaskedarray(): - ak.deprecations_as_errors = False - content = ak.Array([0.0, 1.1, 2.2, 3.3, 4.4]).layout mask = ak.layout.Index8(np.array([False, True, True, False, False], dtype=np.int8)) bytemaskedarray = ak.layout.ByteMaskedArray(mask, content, True) - assert ak.from_arrayset(*ak.to_arrayset(bytemaskedarray)).tolist() == [ + assert ak.from_buffers(*ak.to_buffers(bytemaskedarray)).tolist() == [ None, 1.1, 2.2, @@ -146,14 +132,12 @@ def test_bytemaskedarray(): def test_bitmaskedarray(): - 
ak.deprecations_as_errors = False - content = ak.Array([0.0, 1.1, 2.2, 3.3, 4.4]).layout mask = ak.layout.IndexU8( np.packbits(np.array([False, True, True, False, False], dtype=np.int8)) ) bitmaskedarray = ak.layout.BitMaskedArray(mask, content, True, 5, False) - assert ak.from_arrayset(*ak.to_arrayset(bitmaskedarray)).tolist() == [ + assert ak.from_buffers(*ak.to_buffers(bitmaskedarray)).tolist() == [ None, 1.1, 2.2, @@ -170,13 +154,11 @@ def test_bitmaskedarray(): def test_recordarray(): - ak.deprecations_as_errors = False - - assert ak.from_arrayset( - *ak.to_arrayset([(1.1, [1]), (2.2, [1, 2]), (3.3, [1, 2, 3])]) + assert ak.from_buffers( + *ak.to_buffers([(1.1, [1]), (2.2, [1, 2]), (3.3, [1, 2, 3])]) ).tolist() == [(1.1, [1]), (2.2, [1, 2]), (3.3, [1, 2, 3])] - assert ak.from_arrayset( - *ak.to_arrayset( + assert ak.from_buffers( + *ak.to_buffers( [{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}, {"x": 3.3, "y": [1, 2, 3]}] ) ).tolist() == [ @@ -207,8 +189,6 @@ def test_recordarray(): def test_record(): - ak.deprecations_as_errors = False - assert pickle.loads( pickle.dumps(ak.Record({"x": 2.2, "y": [1, 2]}), -1) ).tolist() == {"x": 2.2, "y": [1, 2]} @@ -227,11 +207,9 @@ def test_record(): def test_regulararray(): - ak.deprecations_as_errors = False - content = ak.Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]).layout regulararray = ak.layout.RegularArray(content, 3, zeros_length=0) - assert ak.from_arrayset(*ak.to_arrayset(regulararray)).tolist() == [ + assert ak.from_buffers(*ak.to_buffers(regulararray)).tolist() == [ [1, 2, 3], [4, 5, 6], [7, 8, 9], @@ -246,9 +224,7 @@ def test_regulararray(): def test_unionarray(): - ak.deprecations_as_errors = False - - assert ak.from_arrayset(*ak.to_arrayset([[1, 2, 3], [], 4, 5])).tolist() == [ + assert ak.from_buffers(*ak.to_buffers([[1, 2, 3], [], 4, 5])).tolist() == [ [1, 2, 3], [], 4, @@ -263,11 +239,9 @@ def test_unionarray(): def test_unmaskedarray(): - ak.deprecations_as_errors = False - content = ak.Array([1, 2, 3, 
4, 5]).layout unmaskedarray = ak.layout.UnmaskedArray(content) - assert ak.from_arrayset(*ak.to_arrayset(unmaskedarray)).tolist() == [1, 2, 3, 4, 5] + assert ak.from_buffers(*ak.to_buffers(unmaskedarray)).tolist() == [1, 2, 3, 4, 5] assert pickle.loads(pickle.dumps(ak.Array(unmaskedarray), -1)).tolist() == [ 1, 2, @@ -278,28 +252,47 @@ def test_unmaskedarray(): def test_partitioned(): - ak.deprecations_as_errors = False - array = ak.repartition(ak.Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), 3) - form, container, num_partitions = ak.to_arrayset(array, partition_first=True) - assert ak.from_arrayset( - form, container, num_partitions, partition_first=True - ).tolist() == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + form, length, container = ak.to_buffers(array) + assert ak.from_buffers(form, length, container).tolist() == [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + ] - form, container, num_partitions = ak.to_arrayset(array, partition_first=False) - assert ak.from_arrayset( - form, container, num_partitions, partition_first=False - ).tolist() == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + form, length, container = ak.to_buffers(array) + assert ak.from_buffers(form, length, container).tolist() == [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + ] one = ak.Array([1, 2, 3, 4, 5]) two = ak.Array([6, 7, 8, 9, 10]) container = {} - form1, _, _ = ak.to_arrayset(one, container, 0) - form2, _, _ = ak.to_arrayset(two, container, 1) + lengths = [] + form1, length, _ = ak.to_buffers(one, container, 0) + lengths.append(length) + form2, length, _ = ak.to_buffers(two, container, 1) + lengths.append(length) assert form1 == form2 - assert ak.from_arrayset(form1, container, 2).tolist() == [ + assert ak.from_buffers(form1, lengths, container).tolist() == [ 1, 2, 3, @@ -326,24 +319,35 @@ def test_partitioned(): def test_lazy(): - ak.deprecations_as_errors = False - array = ak.Array([1, 2, 3, 4, 5]) - form, container, num_partitions = ak.to_arrayset(array) + form, length, container = 
ak.to_buffers(array) - assert ak.from_arrayset( - form, container, num_partitions, lazy=True, lazy_lengths=5 - ).tolist() == [1, 2, 3, 4, 5] + assert ak.from_buffers(form, length, container, lazy=True).tolist() == [ + 1, + 2, + 3, + 4, + 5, + ] def test_lazy_partitioned(): ak.deprecations_as_errors = False array = ak.repartition(ak.Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), 3) - form, container, num_partitions = ak.to_arrayset(array) - assert num_partitions == 4 + form, length, container = ak.to_buffers(array) + assert length == [3, 3, 3, 1] - assert ak.from_arrayset( - form, container, num_partitions, lazy=True, lazy_lengths=[3, 3, 3, 1] - ).tolist() == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + assert ak.from_buffers(form, length, container, lazy=True).tolist() == [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + ] diff --git a/tests/test_0384-lazy-arrayset.py b/tests/test_0384-lazy-arrayset.py index beefa51456..c13ec64662 100644 --- a/tests/test_0384-lazy-arrayset.py +++ b/tests/test_0384-lazy-arrayset.py @@ -21,9 +21,7 @@ def __setitem__(self, key, value): return super(Canary, self).__setitem__(key, value) -def test_lazy_arrayset(): - ak.deprecations_as_errors = False - +def test_lazy_buffers(): array = ak.from_json( """ [ @@ -67,19 +65,19 @@ def test_lazy_arrayset(): ) canary = Canary() - prefix = "kitty" - form, container, npart = ak.to_arrayset(array, container=canary, prefix=prefix) + key_format = "kitty-{form_key}-{attribute}" + form, length, container = ak.to_buffers(array, container=canary, key_format=key_format) assert not any(op[0] == "get" for op in canary.ops) canary.ops = [] cache = {} - out = ak.from_arrayset( + out = ak.from_buffers( form, + length, container, + key_format=key_format, lazy=True, lazy_cache=cache, - lazy_lengths=3, - prefix=prefix, lazy_cache_key="hello", ) assert len(canary.ops) == 0 @@ -104,8 +102,8 @@ def test_lazy_arrayset(): ("get", "kitty-node11-tags"), ("get", "kitty-node11-index"), ("get", "kitty-node14-offsets"), - ("get", 
"kitty-node13"), - ("get", "kitty-node16"), + ("get", "kitty-node13-data"), + ("get", "kitty-node16-data"), } assert set(cache) == { "hello", @@ -117,15 +115,13 @@ def test_lazy_arrayset(): cache.clear() assert ak.to_list(out.masked) == [None, 4, 4] - assert set(canary.ops) == {("get", "kitty-node17-index"), ("get", "kitty-node18")} + assert set(canary.ops) == {("get", "kitty-node17-index"), ("get", "kitty-node18-data")} assert set(cache) == {"hello", "hello(kitty-node17-virtual)"} canary.ops = [] cache.clear() def test_longer_than_expected(): - ak.deprecations_as_errors = False - array = ak.Array( ak.layout.ListOffsetArray64( ak.layout.Index64([0, 2, 4]), @@ -137,7 +133,7 @@ def test_longer_than_expected(): ), ) ) - out = ak.from_arrayset(*ak.to_arrayset(array), lazy=True, lazy_lengths=2) + out = ak.from_buffers(*ak.to_buffers(array), lazy=True) assert ak.to_list(out) == [ [{"item1": 0, "longitem": 0}, {"item1": 1, "longitem": 1}], [{"item1": 2, "longitem": 2}, {"item1": 3, "longitem": 3}], From 3ffe3ec8a084decda6ed410a172cd21ec8fc49db Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 11 Dec 2020 14:13:45 -0600 Subject: [PATCH 7/8] Updated the documentation, too. 
--- docs-src/_toc.yml | 4 +- ...-arrayset.md => how-to-convert-buffers.md} | 74 +++++++------------ docs-src/how-to-convert.md | 2 +- 3 files changed, 31 insertions(+), 49 deletions(-) rename docs-src/{how-to-convert-arrayset.md => how-to-convert-buffers.md} (59%) diff --git a/docs-src/_toc.yml b/docs-src/_toc.yml index cff21ffcdc..e05f4b5b5c 100644 --- a/docs-src/_toc.yml +++ b/docs-src/_toc.yml @@ -20,8 +20,8 @@ title: "Arrow and Parquet" - file: how-to-convert-pandas title: "Pandas" - - file: how-to-convert-arrayset - title: "Generic array-sets" + - file: how-to-convert-buffers + title: "Generic buffers" - file: how-to-create title: "Creating new arrays" diff --git a/docs-src/how-to-convert-arrayset.md b/docs-src/how-to-convert-buffers.md similarity index 59% rename from docs-src/how-to-convert-arrayset.md rename to docs-src/how-to-convert-buffers.md index d94730df0c..f34d93cae9 100644 --- a/docs-src/how-to-convert-arrayset.md +++ b/docs-src/how-to-convert-buffers.md @@ -11,10 +11,10 @@ kernelspec: name: python3 --- -Generic array-sets -================== +Generic buffers +=============== -Most of the conversion functions target a particular library: NumPy, Arrow, Pandas, or Python itself. As a catch-all for other storage formats, Awkward Arrays can be converted to and from "array-sets," sets of named arrays with a schema that can be used to reconstruct the original array. This section will demonstrate how an array-set can be used to store an Awkward Array in an HDF5 file, which ordinarily wouldn't be able to represent nested, irregular data structures. +Most of the conversion functions target a particular library: NumPy, Arrow, Pandas, or Python itself. As a catch-all for other storage formats, Awkward Arrays can be converted to and from sets of named buffers. The buffers are not (usually) intelligible on their own; the length of the array and a JSON document are needed to reconstitute the original structure. 
This section will demonstrate how these named buffers can be used to store an Awkward Array in an HDF5 file, which ordinarily wouldn't be able to represent nested, irregular data structures. ```{code-cell} ipython3 import awkward as ak @@ -23,8 +23,8 @@ import h5py import json ``` -From Awkward to an array-set ---------------------------- +From Awkward to buffers +----------------------- Consider the following complex array: @@ -37,18 +37,17 @@ ak_array = ak.Array([ ak_array ``` -The [ak.to_arrayset](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_arrayset.html) function decomposes it into a set of one-dimensional arrays (a zero-copy operation). +The [ak.to_buffers](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_buffers.html) function decomposes it into a set of one-dimensional arrays (a zero-copy operation). ```{code-cell} ipython3 -form, container, num_partitions = ak.to_arrayset(ak_array) +form, length, container = ak.to_buffers(ak_array) ``` The pieces needed to reconstitute this array are: * the [Form](https://awkward-array.readthedocs.io/en/latest/ak.forms.Form.html), which defines how structure is built from one-dimensional arrays, - * the one-dimensional arrays in the `container` (a [MutableMapping](https://docs.python.org/3/library/collections.abc.html#collections-abstract-base-classes)), - * the number of partitions, if any, - * the length of the original array or lengths of all partitions ([ak.partitions](https://awkward-array.readthedocs.io/en/latest/_auto/ak.partitions.html)) are needed if we wish to read it back _lazily_ (more on that below). + * the length of the original array or lengths of all of its partitions ([ak.partitions](https://awkward-array.readthedocs.io/en/latest/_auto/ak.partitions.html)), + * the one-dimensional arrays in the `container` (a [MutableMapping](https://docs.python.org/3/library/collections.abc.html#collections-abstract-base-classes)).
The [Form](https://awkward-array.readthedocs.io/en/latest/ak.forms.Form.html) is like an Awkward [Type](https://awkward-array.readthedocs.io/en/latest/ak.types.Type.html) in that it describes how the data are structured, but with more detail: it includes distinctions such as the difference between [ListArray](https://awkward-array.readthedocs.io/en/latest/ak.layout.ListArray.html) and [ListOffsetArray](https://awkward-array.readthedocs.io/en/latest/ak.layout.ListOffsetArray.html), as well as the integer types of structural [Indexes](https://awkward-array.readthedocs.io/en/latest/ak.layout.Index.html). @@ -58,37 +57,31 @@ It is usually presented as JSON, and has a compact JSON format (when [Form.tojso form ``` -This `container` is a new dict, but it could have been a user-specified [MutableMapping](https://docs.python.org/3/library/collections.abc.html#collections-abstract-base-classes). +In this case, the `length` is just an integer. It would be a list of integers if `ak_array` was partitioned. ```{code-cell} ipython3 -container +length ``` -This array has no partitions. +This `container` is a new dict, but it could have been a user-specified [MutableMapping](https://docs.python.org/3/library/collections.abc.html#collections-abstract-base-classes) if passed into [ak.to_buffers](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_buffers.html) as an argument. ```{code-cell} ipython3 -num_partitions is None -``` - -This is also what we find from [ak.partitions](https://awkward-array.readthedocs.io/en/latest/_auto/ak.partitions.html). - -```{code-cell} ipython3 -ak.partitions(ak_array) is None +container ``` -From array-set to Awkward -------------------------- +From buffers to Awkward +----------------------- -The function that reverses [ak.to_arrayset](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_arrayset.html) is [ak.from_arrayset](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_arrayset.html). 
Its first three arguments are `form`, `container`, and `num_partitions`. +The function that reverses [ak.to_buffers](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_buffers.html) is [ak.from_buffers](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_buffers.html). Its first three arguments are `form`, `length`, and `container`. ```{code-cell} ipython3 -ak.from_arrayset(form, container, num_partitions) +ak.from_buffers(form, length, container) ``` Saving Awkward Arrays to HDF5 ----------------------------- -The [h5py](https://www.h5py.org/) library presents each group in an HDF5 file as a [MutableMapping](https://docs.python.org/3/library/collections.abc.html#collections-abstract-base-classes), which we can use as a container for an array-set. We must also save the `form`, `num_partitions`, and `length` as metadata for the array to be retrievable. +The [h5py](https://www.h5py.org/) library presents each group in an HDF5 file as a [MutableMapping](https://docs.python.org/3/library/collections.abc.html#collections-abstract-base-classes), which we can use as a container for the buffers. We must also save the `form` and `length` as metadata for the array to be retrievable. ```{code-cell} ipython3 file = h5py.File("/tmp/example.hdf5", "w") @@ -96,10 +89,10 @@ group = file.create_group("awkward") group ``` -We can fill this `group` as a `container` by passing it in to [ak.to_arrayset](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_arrayset.html). +We can fill this `group` as a `container` by passing it in to [ak.to_buffers](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_buffers.html). ```{code-cell} ipython3 -form, container, num_partitions = ak.to_arrayset(ak_array, container=group) +form, length, container = ak.to_buffers(ak_array, container=group) ``` ```{code-cell} ipython3 @@ -115,7 +108,7 @@ container.keys() Here's one.
```{code-cell} ipython3 -np.asarray(container["node0-offsets"]) +np.asarray(container["part0-node0-offsets"]) ``` Now we need to add the other information to the group as metadata. Since HDF5 accepts string-valued metadata, we can put it all in as JSON or numbers. @@ -126,38 +119,27 @@ group.attrs["form"] ``` ```{code-cell} ipython3 -group.attrs["num_partitions"] = json.dumps(num_partitions) -group.attrs["num_partitions"] -``` - -```{code-cell} ipython3 -group.attrs["partition_lengths"] = json.dumps(ak.partitions(ak_array)) -group.attrs["partition_lengths"] -``` - -```{code-cell} ipython3 -group.attrs["length"] = len(ak_array) +group.attrs["length"] = json.dumps(length) # JSON-encode it because it might be a list group.attrs["length"] ``` Reading Awkward Arrays from HDF5 -------------------------------- -With that, we can reconstitute the array by supplying [ak.from_arrayset](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_arrayset.html) the right arguments from the group and metadata. +With that, we can reconstitute the array by supplying [ak.from_buffers](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_buffers.html) the right arguments from the group and metadata. The group can't be used as a `container` as-is, since subscripting it returns `h5py.Dataset` objects, rather than arrays. ```{code-cell} ipython3 -reconstituted = ak.from_arrayset( +reconstituted = ak.from_buffers( ak.forms.Form.fromjson(group.attrs["form"]), + json.loads(group.attrs["length"]), {k: np.asarray(v) for k, v in group.items()}, ) reconstituted ``` -Like [ak.from_parquet](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_parquet.html), [ak.from_arrayset](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_arrayset.html) has the option to read lazily, only accessing record fields and partitions that are accessed. 
- -To do so, we need to pass `lazy=True`, but also the total length of the array (if not partitioned) or the lengths of all the partitions (if partitioned). +Like [ak.from_parquet](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_parquet.html), [ak.from_buffers](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_buffers.html) has the option to read lazily, only loading the record fields and partitions that are actually accessed. ```{code-cell} ipython3 class LazyGet: @@ -168,11 +150,11 @@ class LazyGet: print(key) return np.asarray(self.group[key]) -lazy = ak.from_arrayset( +lazy = ak.from_buffers( ak.forms.Form.fromjson(group.attrs["form"]), + json.loads(group.attrs["length"]), LazyGet(group), lazy=True, - lazy_lengths = group.attrs["length"], ) ``` diff --git a/docs-src/how-to-convert.md b/docs-src/how-to-convert.md index 02610f36de..62b3a13719 100644 --- a/docs-src/how-to-convert.md +++ b/docs-src/how-to-convert.md @@ -20,4 +20,4 @@ Converting arrays * **[ROOT via Uproot](how-to-convert-uproot)** * **[Arrow and Parquet](how-to-convert-arrow)** * **[Pandas](how-to-convert-pandas)** - * **[Generic array-sets](how-to-convert-arrayset)** + * **[Generic buffers](how-to-convert-buffers)** From f068e1d4204d43380febf4a8cde2980f643c0b2b Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 11 Dec 2020 14:58:35 -0600 Subject: [PATCH 8/8] Last touches: length sanity-checks at all levels.
--- src/awkward/operations/convert.py | 100 +++++++++++++++++++++++--- src/libawkward/array/RegularArray.cpp | 4 ++ tests/test_0384-lazy-arrayset.py | 9 ++- 3 files changed, 102 insertions(+), 11 deletions(-) diff --git a/src/awkward/operations/convert.py b/src/awkward/operations/convert.py index 0f355051fa..5454ae8c8b 100644 --- a/src/awkward/operations/convert.py +++ b/src/awkward/operations/convert.py @@ -3193,16 +3193,19 @@ def _form_to_layout( container, partnum, key_format, - len(mask), + length, lazy_cache, lazy_cache_key, ) + if length is None: + length = len(content) + return ak.layout.BitMaskedArray( mask, content, form.valid_when, - len(content), + length, form.lsb_order, identities, parameters, @@ -3218,12 +3221,22 @@ def _form_to_layout( raw_mask.view(_index_form_to_dtype[form.mask]) ) + if length is None: + length = len(mask) + elif length != len(mask): + raise ValueError( + "ByteMaskedArray length mismatch: expected {0}, observed {1}".format( + length, len(mask) + ) + + ak._util.exception_suffix(__file__) + ) + content = _form_to_layout( form.content, container, partnum, key_format, - len(mask), + length, lazy_cache, lazy_cache_key, ) @@ -3233,6 +3246,13 @@ def _form_to_layout( ) elif isinstance(form, ak.forms.EmptyForm): + if length is not None and length != 0: + raise ValueError( + "EmptyArray length mismatch: expected {0}, observed {1}".format( + length, 0 + ) + + ak._util.exception_suffix(__file__) + ) return ak.layout.EmptyArray(identities, parameters) elif isinstance(form, ak.forms.IndexedForm): @@ -3245,6 +3265,16 @@ def _form_to_layout( raw_index.view(_index_form_to_dtype[form.index]) ) + if length is None: + length = len(index) + elif length != len(index): + raise ValueError( + "IndexedArray length mismatch: expected {0}, observed {1}".format( + length, len(index) + ) + + ak._util.exception_suffix(__file__) + ) + content = _form_to_layout( form.content, container, @@ -3269,6 +3299,16 @@ def _form_to_layout( 
raw_index.view(_index_form_to_dtype[form.index]) ) + if length is None: + length = len(index) + elif length != len(index): + raise ValueError( + "IndexedOptionArray length mismatch: expected {0}, observed {1}".format( + length, len(index) + ) + + ak._util.exception_suffix(__file__) + ) + content = _form_to_layout( form.content, container, @@ -3301,6 +3341,16 @@ def _form_to_layout( raw_stops.view(_index_form_to_dtype[form.stops]) ) + if length is None: + length = len(starts) + elif length != len(starts): + raise ValueError( + "ListArray length mismatch: expected {0}, observed {1}".format( + length, len(starts) + ) + + ak._util.exception_suffix(__file__) + ) + array_starts = numpy.asarray(starts) array_stops = numpy.asarray(stops)[: len(array_starts)] array_stops = array_stops[array_starts != array_stops] @@ -3328,6 +3378,16 @@ def _form_to_layout( raw_offsets.view(_index_form_to_dtype[form.offsets]) ) + if length is None: + length = len(offsets) - 1 + elif length != len(offsets) - 1: + raise ValueError( + "ListOffsetArray length mismatch: expected {0}, observed {1}".format( + length, len(offsets) - 1 + ) + + ak._util.exception_suffix(__file__) + ) + content = _form_to_layout( form.content, container, @@ -3354,7 +3414,11 @@ def _form_to_layout( dtype, inner_shape = dtype_inner_shape, () else: dtype, inner_shape = dtype_inner_shape.subdtype - shape = (-1,) + inner_shape + + if length is None: + shape = (-1,) + inner_shape + else: + shape = (length,) + inner_shape array = raw_array.view(dtype).reshape(shape) @@ -3386,6 +3450,14 @@ def _form_to_layout( if length is None: length = minlength + elif minlength is not None and length > minlength: + raise ValueError( + "RecordArray length mismatch: expected {0}, minimum content is {1}".format( + length, minlength + ) + + ak._util.exception_suffix(__file__) + ) + return ak.layout.RecordArray( contents, None if form.istuple else keys, length, identities, parameters, ) @@ -3426,6 +3498,16 @@ def _form_to_layout( 
raw_index.view(_index_form_to_dtype[form.index]) ) + if length is None: + length = len(tags) + elif length != len(tags): + raise ValueError( + "UnionArray length mismatch: expected {0}, observed {1}".format( + length, len(tags) + ) + + ak._util.exception_suffix(__file__) + ) + contents = [] for i, content_form in enumerate(form.contents): mine = numpy.array(index)[numpy.equal(tags, i)] @@ -3635,7 +3717,7 @@ def kf(**v): + ak._util.exception_suffix(__file__) ), "1.1.0", - "January 1, 2021", + "February 1, 2021", ) args = (form, container, str(partition_start), key_format, length) @@ -3736,7 +3818,7 @@ def to_arrayset( is sorted or lookup performance depends on alphabetical order. **Deprecated:** This will be removed in `awkward>=1.1.0` (target date: - January 1, 2021). Use #ak.to_buffers instead: the arguments and return + February 1, 2021). Use #ak.to_buffers instead: the arguments and return values have changed. Decomposes an Awkward Array into a Form and a collection of arrays, so @@ -3845,7 +3927,7 @@ def to_arrayset( + ak._util.exception_suffix(__file__) ), "1.1.0", - "January 1, 2021", + "February 1, 2021", ) layout = to_layout(array, allow_record=False, allow_other=False) @@ -3980,7 +4062,7 @@ def from_arrayset( high-level. **Deprecated:** This will be removed in `awkward>=1.1.0` (target date: - January 1, 2021). Use #ak.from_buffers instead: the arguments have changed. + February 1, 2021). Use #ak.from_buffers instead: the arguments have changed. 
Reconstructs an Awkward Array from a Form and a collection of arrays, so that data can be losslessly read from file formats and storage devices that @@ -4019,7 +4101,7 @@ def from_arrayset( + ak._util.exception_suffix(__file__) ), "1.1.0", - "January 1, 2021", + "February 1, 2021", ) if num_partitions is None: diff --git a/src/libawkward/array/RegularArray.cpp b/src/libawkward/array/RegularArray.cpp index a9f1bef99f..c63aee7b78 100644 --- a/src/libawkward/array/RegularArray.cpp +++ b/src/libawkward/array/RegularArray.cpp @@ -474,6 +474,10 @@ namespace awkward { std::stringstream out; out << indent << pre << "<" << classname() << " size=\"" << size_ << "\">\n"; + if (size_ == 0) { + out << indent << pre << "<" << classname() << " length=\"" << length_ + << "\">\n"; + } if (identities_.get() != nullptr) { out << identities_.get()->tostring_part( indent + std::string(" "), "", "\n"); diff --git a/tests/test_0384-lazy-arrayset.py b/tests/test_0384-lazy-arrayset.py index c13ec64662..0cfccd86a9 100644 --- a/tests/test_0384-lazy-arrayset.py +++ b/tests/test_0384-lazy-arrayset.py @@ -66,7 +66,9 @@ def test_lazy_buffers(): canary = Canary() key_format = "kitty-{form_key}-{attribute}" - form, length, container = ak.to_buffers(array, container=canary, key_format=key_format) + form, length, container = ak.to_buffers( + array, container=canary, key_format=key_format + ) assert not any(op[0] == "get" for op in canary.ops) canary.ops = [] @@ -115,7 +117,10 @@ def test_lazy_buffers(): cache.clear() assert ak.to_list(out.masked) == [None, 4, 4] - assert set(canary.ops) == {("get", "kitty-node17-index"), ("get", "kitty-node18-data")} + assert set(canary.ops) == { + ("get", "kitty-node17-index"), + ("get", "kitty-node18-data"), + } assert set(cache) == {"hello", "hello(kitty-node17-virtual)"} canary.ops = [] cache.clear()