Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C++ refactoring: ak.broadcast_arrays #1368

Merged
merged 3 commits into from
Mar 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
284 changes: 143 additions & 141 deletions src/awkward/_v2/operations/structure/ak_broadcast_arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,144 +7,146 @@

# @ak._v2._connect.numpy.implements("broadcast_arrays")
def broadcast_arrays(*arrays, **kwargs):
raise ak._v2._util.error(NotImplementedError)


# """
# Args:
# arrays: Arrays to broadcast into the same structure.
# left_broadcast (bool): If True, follow rules for implicit
# left-broadcasting, as described below.
# right_broadcast (bool): If True, follow rules for implicit
# right-broadcasting, as described below.
# highlevel (bool, default is True): If True, return an #ak.Array;
# otherwise, return a low-level #ak.layout.Content subclass.

# Like NumPy's
# [broadcast_arrays](https://docs.scipy.org/doc/numpy/reference/generated/numpy.broadcast_arrays.html)
# function, this function returns the input `arrays` with enough elements
# duplicated that they can be combined element-by-element.

# For NumPy arrays, this means that scalars are replaced with arrays with
# the same scalar value repeated at every element of the array, and regular
# dimensions are created to increase low-dimensional data into
# high-dimensional data.

# For example,

# >>> ak.broadcast_arrays(5,
# ... [1, 2, 3, 4, 5])
# [<Array [5, 5, 5, 5, 5] type='5 * int64'>,
# <Array [1, 2, 3, 4, 5] type='5 * int64'>]

# and

# >>> ak.broadcast_arrays(np.array([1, 2, 3]),
# ... np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
# [<Array [[ 1, 2, 3], [ 1, 2, 3]] type='2 * 3 * int64'>,
# <Array [[0.1, 0.2, 0.3], [10, 20, 30]] type='2 * 3 * float64'>]

# Note that in the second example, when the `3 * int64` array is expanded
# to match the `2 * 3 * float64` array, it is the deepest dimension that
# is aligned. If we try to match a `2 * int64` with the `2 * 3 * float64`,

# >>> ak.broadcast_arrays(np.array([1, 2]),
# ... np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
# ValueError: cannot broadcast RegularArray of size 2 with RegularArray of size 3

# NumPy has the same behavior: arrays with different numbers of dimensions
# are aligned to the right before expansion. One can control this by
# explicitly adding a new axis (reshape to add a dimension of length 1)
# where the expansion is supposed to take place because a dimension of
# length 1 can be expanded like a scalar.

# >>> ak.broadcast_arrays(np.array([1, 2])[:, np.newaxis],
# ... np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
# [<Array [[ 1, 1, 1], [ 2, 2, 2]] type='2 * 3 * int64'>,
# <Array [[0.1, 0.2, 0.3], [10, 20, 30]] type='2 * 3 * float64'>]

# Again, NumPy does the same thing (`np.newaxis` is equal to None, so this
# trick is often shown with None in the slice-tuple). Where the broadcasting
# happens can be controlled, but numbers of dimensions that don't match are
# implicitly aligned to the right (fitting innermost structure, not
# outermost).

# While that might be an arbitrary decision for rectilinear arrays, it is
# much more natural for implicit broadcasting to align left for tree-like
# structures. That is, the root of each data structure should agree and
# leaves may be duplicated to match. For example,

# >>> ak.broadcast_arrays([ 100, 200, 300],
# ... [[1.1, 2.2, 3.3], [], [4.4, 5.5]])
# [<Array [[100, 100, 100], [], [300, 300]] type='3 * var * int64'>,
# <Array [[1.1, 2.2, 3.3], [], [4.4, 5.5]] type='3 * var * float64'>]

# One typically wants single-item-per-element data to be duplicated to
# match multiple-items-per-element data. Operations on the broadcasted
# arrays like

# one_dimensional + nested_lists

# would then have the same effect as the procedural code

# for x, outer in zip(one_dimensional, nested_lists):
# output = []
# for inner in outer:
# output.append(x + inner)
# yield output

# where `x` has the same value for each `inner` in the inner loop.

# Awkward Array's broadcasting manages to have it both ways by applying the
# following rules:

# * If all dimensions are regular (i.e. #ak.types.RegularType), like NumPy,
# implicit broadcasting aligns to the right, like NumPy.
# * If any dimension is variable (i.e. #ak.types.ListType), which can
# never be true of NumPy, implicit broadcasting aligns to the left.
# * Explicit broadcasting with a length-1 regular dimension always
# broadcasts, like NumPy.

# Thus, it is important to be aware of the distinction between a dimension
# that is declared to be regular in the type specification and a dimension
# that is allowed to be variable (even if it happens to have the same length
# for all elements). This distinction is can be accessed through the
# #ak.Array.type, but it is lost when converting an array into JSON or
# Python objects.
# """
# (highlevel, left_broadcast, right_broadcast) = ak._v2._util.extra(
# (),
# kwargs,
# [("highlevel", True), ("left_broadcast", True), ("right_broadcast", True)],
# )

# inputs = []
# for x in arrays:
# y = ak._v2.operations.convert.to_layout(x, allow_record=True, allow_other=True)
# if isinstance(y, ak.partition.PartitionedArray): # NO PARTITIONED ARRAY
# y = y.toContent()
# if not isinstance(y, (ak._v2.contents.Content, ak._v2.contents.Record)):
# y = ak._v2.contents.NumpyArray(ak.nplike.of(*arrays).array([y]))
# inputs.append(y)

# def getfunction(inputs):
# if all(isinstance(x, ak._v2.contents.NumpyArray) for x in inputs):
# return lambda: tuple(inputs)
# else:
# return None

# behavior = ak._v2._util.behaviorof(*arrays)
# out = ak._v2._util.broadcast_and_apply(
# inputs,
# getfunction,
# behavior,
# left_broadcast=left_broadcast,
# right_broadcast=right_broadcast,
# pass_depth=False,
# numpy_to_regular=True,
# )
# assert isinstance(out, tuple)
# if highlevel:
# return [ak._v2._util.wrap(x, behavior) for x in out]
# else:
# return list(out)
"""
Args:
arrays: Arrays to broadcast into the same structure.
left_broadcast (bool): If True, follow rules for implicit
left-broadcasting, as described below.
right_broadcast (bool): If True, follow rules for implicit
right-broadcasting, as described below.
highlevel (bool, default is True): If True, return an #ak.Array;
otherwise, return a low-level #ak.layout.Content subclass.

Like NumPy's
[broadcast_arrays](https://docs.scipy.org/doc/numpy/reference/generated/numpy.broadcast_arrays.html)
function, this function returns the input `arrays` with enough elements
duplicated that they can be combined element-by-element.

For NumPy arrays, this means that scalars are replaced with arrays with
the same scalar value repeated at every element of the array, and regular
dimensions are created to increase low-dimensional data into
high-dimensional data.

For example,

>>> ak.broadcast_arrays(5,
... [1, 2, 3, 4, 5])
[<Array [5, 5, 5, 5, 5] type='5 * int64'>,
<Array [1, 2, 3, 4, 5] type='5 * int64'>]

and

>>> ak.broadcast_arrays(np.array([1, 2, 3]),
... np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
[<Array [[ 1, 2, 3], [ 1, 2, 3]] type='2 * 3 * int64'>,
<Array [[0.1, 0.2, 0.3], [10, 20, 30]] type='2 * 3 * float64'>]

Note that in the second example, when the `3 * int64` array is expanded
to match the `2 * 3 * float64` array, it is the deepest dimension that
is aligned. If we try to match a `2 * int64` with the `2 * 3 * float64`,

>>> ak.broadcast_arrays(np.array([1, 2]),
... np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
ValueError: cannot broadcast RegularArray of size 2 with RegularArray of size 3

NumPy has the same behavior: arrays with different numbers of dimensions
are aligned to the right before expansion. One can control this by
explicitly adding a new axis (reshape to add a dimension of length 1)
where the expansion is supposed to take place because a dimension of
length 1 can be expanded like a scalar.

>>> ak.broadcast_arrays(np.array([1, 2])[:, np.newaxis],
... np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
[<Array [[ 1, 1, 1], [ 2, 2, 2]] type='2 * 3 * int64'>,
<Array [[0.1, 0.2, 0.3], [10, 20, 30]] type='2 * 3 * float64'>]

Again, NumPy does the same thing (`np.newaxis` is equal to None, so this
trick is often shown with None in the slice-tuple). Where the broadcasting
happens can be controlled, but numbers of dimensions that don't match are
implicitly aligned to the right (fitting innermost structure, not
outermost).

While that might be an arbitrary decision for rectilinear arrays, it is
much more natural for implicit broadcasting to align left for tree-like
structures. That is, the root of each data structure should agree and
leaves may be duplicated to match. For example,

>>> ak.broadcast_arrays([ 100, 200, 300],
... [[1.1, 2.2, 3.3], [], [4.4, 5.5]])
[<Array [[100, 100, 100], [], [300, 300]] type='3 * var * int64'>,
<Array [[1.1, 2.2, 3.3], [], [4.4, 5.5]] type='3 * var * float64'>]

One typically wants single-item-per-element data to be duplicated to
match multiple-items-per-element data. Operations on the broadcasted
arrays like

one_dimensional + nested_lists

would then have the same effect as the procedural code

for x, outer in zip(one_dimensional, nested_lists):
output = []
for inner in outer:
output.append(x + inner)
yield output

where `x` has the same value for each `inner` in the inner loop.

Awkward Array's broadcasting manages to have it both ways by applying the
following rules:

* If all dimensions are regular (i.e. #ak.types.RegularType), like NumPy,
implicit broadcasting aligns to the right, like NumPy.
* If any dimension is variable (i.e. #ak.types.ListType), which can
never be true of NumPy, implicit broadcasting aligns to the left.
* Explicit broadcasting with a length-1 regular dimension always
broadcasts, like NumPy.

Thus, it is important to be aware of the distinction between a dimension
that is declared to be regular in the type specification and a dimension
that is allowed to be variable (even if it happens to have the same length
for all elements). This distinction is can be accessed through the
#ak.Array.type, but it is lost when converting an array into JSON or
Python objects.
"""
with ak._v2._util.OperationErrorContext(
"ak._v2.broadcast_arrays",
dict(arrays=arrays, kwargs=kwargs),
):
return _impl(arrays, kwargs)


def _impl(arrays, kwargs):
(highlevel, left_broadcast, right_broadcast) = ak._v2._util.extra(
(),
kwargs,
[("highlevel", True), ("left_broadcast", True), ("right_broadcast", True)],
)

inputs = []
for x in arrays:
y = ak._v2.operations.convert.to_layout(x, allow_record=True, allow_other=True)
if not isinstance(y, (ak._v2.contents.Content, ak._v2.Record)):
y = ak._v2.contents.NumpyArray(ak.nplike.of(*arrays).array([y]))
inputs.append(y)

def action(inputs, **kwargs):
if all(isinstance(x, ak._v2.contents.NumpyArray) for x in inputs):
return tuple(inputs)
else:
return None

behavior = ak._v2._util.behavior_of(*arrays)
out = ak._v2._broadcasting.broadcast_and_apply(
inputs,
action,
behavior,
left_broadcast=left_broadcast,
right_broadcast=right_broadcast,
numpy_to_regular=True,
)
assert isinstance(out, tuple)
if highlevel:
return [ak._v2._util.wrap(x, behavior) for x in out]
else:
return list(out)
36 changes: 36 additions & 0 deletions tests/v2/test_0119-numexpr-and-broadcast-arrays.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE


import pytest # noqa: F401
import numpy as np # noqa: F401
import awkward as ak # noqa: F401

to_list = ak._v2.operations.convert.to_list


def test_broadcast_arrays():
a = ak._v2.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]], check_valid=True)
b = ak._v2.Array([100, 200, 300], check_valid=True)

out = ak._v2.operations.structure.broadcast_arrays(a, b)
assert to_list(out[0]) == [[1.1, 2.2, 3.3], [], [4.4, 5.5]]
assert to_list(out[1]) == [[100, 100, 100], [], [300, 300]]


numexpr = pytest.importorskip("numexpr")


@pytest.mark.skip(reason="FIXME: ak.numexpr")
def test_numexpr():
# NumExpr's interface pulls variables from the surrounding scope,
# so these F841 "unused variables" actually are used.

a = ak._v2.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]], check_valid=True) # noqa: F841
b = ak._v2.Array([100, 200, 300], check_valid=True) # noqa: F841
assert to_list(ak.numexpr.evaluate("a + b")) == [
[101.1, 102.2, 103.3],
[],
[304.4, 305.5],
]
a = [1, 2, 3] # noqa: F841
assert to_list(ak.numexpr.re_evaluate()) == [101, 202, 303]
3 changes: 0 additions & 3 deletions tests/v2/test_0334-fully-broadcastable-where.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@ def test():
]


@pytest.mark.skip(
reason="ak._v2.operations.structure.broadcast_arrays to be implemented."
)
def test_issue_334():
a = ak._v2.highlevel.Array([1, 2, 3, 4])
b = ak._v2.highlevel.Array([-1])
Expand Down
4 changes: 3 additions & 1 deletion tests/v2/test_0806-empty-lists-cartesian-fix.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def test_cartesian():
)
assert to_list(result) == [None, None]

one, two = ak.broadcast_arrays(candidate, ak._v2.Array([[1, 2, 3], []]))
one, two = ak._v2.operations.structure.broadcast_arrays(
candidate, ak._v2.Array([[1, 2, 3], []])
)
assert to_list(one) == [None, None]
assert to_list(two) == [None, None]
19 changes: 19 additions & 0 deletions tests/v2/test_1017-numpyarray-broadcast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE


import pytest # noqa: F401
import numpy as np # noqa: F401
import awkward as ak # noqa: F401


def test():
x = ak._v2.contents.ListOffsetArray(
ak._v2.index.Index64(np.array([0, 2, 2, 4, 6])),
ak._v2.contents.NumpyArray(np.arange(8 * 8).reshape(8, -1)),
)
y = ak._v2.contents.ListOffsetArray(
ak._v2.index.Index64(np.array([0, 2, 2, 4, 6])),
ak._v2.contents.NumpyArray(np.arange(8)),
)
u, v = ak._v2.operations.structure.broadcast_arrays(x, y)
assert u.ndim == v.ndim