C++ refactoring: ak.broadcast_arrays (#1368)

* ak.broadcast_arrays and testing * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Moved numexpr importorskip Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
scikit-hep · Mar 15, 2022 · 922b727 · 922b727
1 parent cd661ec
commit 922b727
Show file tree

Hide file tree

Showing 5 changed files with 201 additions and 145 deletions.
diff --git a/src/awkward/_v2/operations/structure/ak_broadcast_arrays.py b/src/awkward/_v2/operations/structure/ak_broadcast_arrays.py
@@ -7,144 +7,146 @@
 
 # @ak._v2._connect.numpy.implements("broadcast_arrays")
 def broadcast_arrays(*arrays, **kwargs):
-    raise ak._v2._util.error(NotImplementedError)
-
-
-#     """
-#     Args:
-#         arrays: Arrays to broadcast into the same structure.
-#         left_broadcast (bool): If True, follow rules for implicit
-#             left-broadcasting, as described below.
-#         right_broadcast (bool): If True, follow rules for implicit
-#             right-broadcasting, as described below.
-#         highlevel (bool, default is True): If True, return an #ak.Array;
-#             otherwise, return a low-level #ak.layout.Content subclass.
-
-#     Like NumPy's
-#     [broadcast_arrays](https://docs.scipy.org/doc/numpy/reference/generated/numpy.broadcast_arrays.html)
-#     function, this function returns the input `arrays` with enough elements
-#     duplicated that they can be combined element-by-element.
-
-#     For NumPy arrays, this means that scalars are replaced with arrays with
-#     the same scalar value repeated at every element of the array, and regular
-#     dimensions are created to increase low-dimensional data into
-#     high-dimensional data.
-
-#     For example,
-
-#         >>> ak.broadcast_arrays(5,
-#         ...                     [1, 2, 3, 4, 5])
-#         [<Array [5, 5, 5, 5, 5] type='5 * int64'>,
-#          <Array [1, 2, 3, 4, 5] type='5 * int64'>]
-
-#     and
-
-#         >>> ak.broadcast_arrays(np.array([1, 2, 3]),
-#         ...                     np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
-#         [<Array [[  1,   2,   3], [ 1,  2,  3]] type='2 * 3 * int64'>,
-#          <Array [[0.1, 0.2, 0.3], [10, 20, 30]] type='2 * 3 * float64'>]
-
-#     Note that in the second example, when the `3 * int64` array is expanded
-#     to match the `2 * 3 * float64` array, it is the deepest dimension that
-#     is aligned. If we try to match a `2 * int64` with the `2 * 3 * float64`,
-
-#         >>> ak.broadcast_arrays(np.array([1, 2]),
-#         ...                     np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
-#         ValueError: cannot broadcast RegularArray of size 2 with RegularArray of size 3
-
-#     NumPy has the same behavior: arrays with different numbers of dimensions
-#     are aligned to the right before expansion. One can control this by
-#     explicitly adding a new axis (reshape to add a dimension of length 1)
-#     where the expansion is supposed to take place because a dimension of
-#     length 1 can be expanded like a scalar.
-
-#         >>> ak.broadcast_arrays(np.array([1, 2])[:, np.newaxis],
-#         ...                     np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
-#         [<Array [[  1,   1,   1], [ 2,  2,  2]] type='2 * 3 * int64'>,
-#          <Array [[0.1, 0.2, 0.3], [10, 20, 30]] type='2 * 3 * float64'>]
-
-#     Again, NumPy does the same thing (`np.newaxis` is equal to None, so this
-#     trick is often shown with None in the slice-tuple). Where the broadcasting
-#     happens can be controlled, but numbers of dimensions that don't match are
-#     implicitly aligned to the right (fitting innermost structure, not
-#     outermost).
-
-#     While that might be an arbitrary decision for rectilinear arrays, it is
-#     much more natural for implicit broadcasting to align left for tree-like
-#     structures. That is, the root of each data structure should agree and
-#     leaves may be duplicated to match. For example,
-
-#         >>> ak.broadcast_arrays([            100,   200,        300],
-#         ...                     [[1.1, 2.2, 3.3],    [], [4.4, 5.5]])
-#         [<Array [[100, 100, 100], [], [300, 300]] type='3 * var * int64'>,
-#          <Array [[1.1, 2.2, 3.3], [], [4.4, 5.5]] type='3 * var * float64'>]
-
-#     One typically wants single-item-per-element data to be duplicated to
-#     match multiple-items-per-element data. Operations on the broadcasted
-#     arrays like
-
-#         one_dimensional + nested_lists
-
-#     would then have the same effect as the procedural code
-
-#         for x, outer in zip(one_dimensional, nested_lists):
-#             output = []
-#             for inner in outer:
-#                 output.append(x + inner)
-#             yield output
-
-#     where `x` has the same value for each `inner` in the inner loop.
-
-#     Awkward Array's broadcasting manages to have it both ways by applying the
-#     following rules:
-
-#        * If all dimensions are regular (i.e. #ak.types.RegularType), like NumPy,
-#          implicit broadcasting aligns to the right, like NumPy.
-#        * If any dimension is variable (i.e. #ak.types.ListType), which can
-#          never be true of NumPy, implicit broadcasting aligns to the left.
-#        * Explicit broadcasting with a length-1 regular dimension always
-#          broadcasts, like NumPy.
-
-#     Thus, it is important to be aware of the distinction between a dimension
-#     that is declared to be regular in the type specification and a dimension
-#     that is allowed to be variable (even if it happens to have the same length
-#     for all elements). This distinction is can be accessed through the
-#     #ak.Array.type, but it is lost when converting an array into JSON or
-#     Python objects.
-#     """
-#     (highlevel, left_broadcast, right_broadcast) = ak._v2._util.extra(
-#         (),
-#         kwargs,
-#         [("highlevel", True), ("left_broadcast", True), ("right_broadcast", True)],
-#     )
-
-#     inputs = []
-#     for x in arrays:
-#         y = ak._v2.operations.convert.to_layout(x, allow_record=True, allow_other=True)
-#         if isinstance(y, ak.partition.PartitionedArray):   # NO PARTITIONED ARRAY
-#             y = y.toContent()
-#         if not isinstance(y, (ak._v2.contents.Content, ak._v2.contents.Record)):
-#             y = ak._v2.contents.NumpyArray(ak.nplike.of(*arrays).array([y]))
-#         inputs.append(y)
-
-#     def getfunction(inputs):
-#         if all(isinstance(x, ak._v2.contents.NumpyArray) for x in inputs):
-#             return lambda: tuple(inputs)
-#         else:
-#             return None
-
-#     behavior = ak._v2._util.behaviorof(*arrays)
-#     out = ak._v2._util.broadcast_and_apply(
-#         inputs,
-#         getfunction,
-#         behavior,
-#         left_broadcast=left_broadcast,
-#         right_broadcast=right_broadcast,
-#         pass_depth=False,
-#         numpy_to_regular=True,
-#     )
-#     assert isinstance(out, tuple)
-#     if highlevel:
-#         return [ak._v2._util.wrap(x, behavior) for x in out]
-#     else:
-#         return list(out)
+    """
+    Args:
+        arrays: Arrays to broadcast into the same structure.
+        left_broadcast (bool): If True, follow rules for implicit
+            left-broadcasting, as described below.
+        right_broadcast (bool): If True, follow rules for implicit
+            right-broadcasting, as described below.
+        highlevel (bool, default is True): If True, return an #ak.Array;
+            otherwise, return a low-level #ak.layout.Content subclass.
+
+    Like NumPy's
+    [broadcast_arrays](https://docs.scipy.org/doc/numpy/reference/generated/numpy.broadcast_arrays.html)
+    function, this function returns the input `arrays` with enough elements
+    duplicated that they can be combined element-by-element.
+
+    For NumPy arrays, this means that scalars are replaced with arrays with
+    the same scalar value repeated at every element of the array, and regular
+    dimensions are created to increase low-dimensional data into
+    high-dimensional data.
+
+    For example,
+
+        >>> ak.broadcast_arrays(5,
+        ...                     [1, 2, 3, 4, 5])
+        [<Array [5, 5, 5, 5, 5] type='5 * int64'>,
+         <Array [1, 2, 3, 4, 5] type='5 * int64'>]
+
+    and
+
+        >>> ak.broadcast_arrays(np.array([1, 2, 3]),
+        ...                     np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
+        [<Array [[  1,   2,   3], [ 1,  2,  3]] type='2 * 3 * int64'>,
+         <Array [[0.1, 0.2, 0.3], [10, 20, 30]] type='2 * 3 * float64'>]
+
+    Note that in the second example, when the `3 * int64` array is expanded
+    to match the `2 * 3 * float64` array, it is the deepest dimension that
+    is aligned. If we try to match a `2 * int64` with the `2 * 3 * float64`,
+
+        >>> ak.broadcast_arrays(np.array([1, 2]),
+        ...                     np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
+        ValueError: cannot broadcast RegularArray of size 2 with RegularArray of size 3
+
+    NumPy has the same behavior: arrays with different numbers of dimensions
+    are aligned to the right before expansion. One can control this by
+    explicitly adding a new axis (reshape to add a dimension of length 1)
+    where the expansion is supposed to take place because a dimension of
+    length 1 can be expanded like a scalar.
+
+        >>> ak.broadcast_arrays(np.array([1, 2])[:, np.newaxis],
+        ...                     np.array([[0.1, 0.2, 0.3], [10, 20, 30]]))
+        [<Array [[  1,   1,   1], [ 2,  2,  2]] type='2 * 3 * int64'>,
+         <Array [[0.1, 0.2, 0.3], [10, 20, 30]] type='2 * 3 * float64'>]
+
+    Again, NumPy does the same thing (`np.newaxis` is equal to None, so this
+    trick is often shown with None in the slice-tuple). Where the broadcasting
+    happens can be controlled, but numbers of dimensions that don't match are
+    implicitly aligned to the right (fitting innermost structure, not
+    outermost).
+
+    While that might be an arbitrary decision for rectilinear arrays, it is
+    much more natural for implicit broadcasting to align left for tree-like
+    structures. That is, the root of each data structure should agree and
+    leaves may be duplicated to match. For example,
+
+        >>> ak.broadcast_arrays([            100,   200,        300],
+        ...                     [[1.1, 2.2, 3.3],    [], [4.4, 5.5]])
+        [<Array [[100, 100, 100], [], [300, 300]] type='3 * var * int64'>,
+         <Array [[1.1, 2.2, 3.3], [], [4.4, 5.5]] type='3 * var * float64'>]
+
+    One typically wants single-item-per-element data to be duplicated to
+    match multiple-items-per-element data. Operations on the broadcasted
+    arrays like
+
+        one_dimensional + nested_lists
+
+    would then have the same effect as the procedural code
+
+        for x, outer in zip(one_dimensional, nested_lists):
+            output = []
+            for inner in outer:
+                output.append(x + inner)
+            yield output
+
+    where `x` has the same value for each `inner` in the inner loop.
+
+    Awkward Array's broadcasting manages to have it both ways by applying the
+    following rules:
+
+       * If all dimensions are regular (i.e. #ak.types.RegularType), like NumPy,
+         implicit broadcasting aligns to the right, like NumPy.
+       * If any dimension is variable (i.e. #ak.types.ListType), which can
+         never be true of NumPy, implicit broadcasting aligns to the left.
+       * Explicit broadcasting with a length-1 regular dimension always
+         broadcasts, like NumPy.
+
+    Thus, it is important to be aware of the distinction between a dimension
+    that is declared to be regular in the type specification and a dimension
+    that is allowed to be variable (even if it happens to have the same length
+    for all elements). This distinction is can be accessed through the
+    #ak.Array.type, but it is lost when converting an array into JSON or
+    Python objects.
+    """
+    with ak._v2._util.OperationErrorContext(
+        "ak._v2.broadcast_arrays",
+        dict(arrays=arrays, kwargs=kwargs),
+    ):
+        return _impl(arrays, kwargs)
+
+
+def _impl(arrays, kwargs):
+    (highlevel, left_broadcast, right_broadcast) = ak._v2._util.extra(
+        (),
+        kwargs,
+        [("highlevel", True), ("left_broadcast", True), ("right_broadcast", True)],
+    )
+
+    inputs = []
+    for x in arrays:
+        y = ak._v2.operations.convert.to_layout(x, allow_record=True, allow_other=True)
+        if not isinstance(y, (ak._v2.contents.Content, ak._v2.Record)):
+            y = ak._v2.contents.NumpyArray(ak.nplike.of(*arrays).array([y]))
+        inputs.append(y)
+
+    def action(inputs, **kwargs):
+        if all(isinstance(x, ak._v2.contents.NumpyArray) for x in inputs):
+            return tuple(inputs)
+        else:
+            return None
+
+    behavior = ak._v2._util.behavior_of(*arrays)
+    out = ak._v2._broadcasting.broadcast_and_apply(
+        inputs,
+        action,
+        behavior,
+        left_broadcast=left_broadcast,
+        right_broadcast=right_broadcast,
+        numpy_to_regular=True,
+    )
+    assert isinstance(out, tuple)
+    if highlevel:
+        return [ak._v2._util.wrap(x, behavior) for x in out]
+    else:
+        return list(out)
diff --git a/tests/v2/test_0119-numexpr-and-broadcast-arrays.py b/tests/v2/test_0119-numexpr-and-broadcast-arrays.py
@@ -0,0 +1,36 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+
+import pytest  # noqa: F401
+import numpy as np  # noqa: F401
+import awkward as ak  # noqa: F401
+
+to_list = ak._v2.operations.convert.to_list
+
+
+def test_broadcast_arrays():
+    a = ak._v2.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]], check_valid=True)
+    b = ak._v2.Array([100, 200, 300], check_valid=True)
+
+    out = ak._v2.operations.structure.broadcast_arrays(a, b)
+    assert to_list(out[0]) == [[1.1, 2.2, 3.3], [], [4.4, 5.5]]
+    assert to_list(out[1]) == [[100, 100, 100], [], [300, 300]]
+
+
+numexpr = pytest.importorskip("numexpr")
+
+
+@pytest.mark.skip(reason="FIXME: ak.numexpr")
+def test_numexpr():
+    # NumExpr's interface pulls variables from the surrounding scope,
+    # so these F841 "unused variables" actually are used.
+
+    a = ak._v2.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]], check_valid=True)  # noqa: F841
+    b = ak._v2.Array([100, 200, 300], check_valid=True)  # noqa: F841
+    assert to_list(ak.numexpr.evaluate("a + b")) == [
+        [101.1, 102.2, 103.3],
+        [],
+        [304.4, 305.5],
+    ]
+    a = [1, 2, 3]  # noqa: F841
+    assert to_list(ak.numexpr.re_evaluate()) == [101, 202, 303]
diff --git a/tests/v2/test_0334-fully-broadcastable-where.py b/tests/v2/test_0334-fully-broadcastable-where.py
@@ -25,9 +25,6 @@ def test():
     ]
 
 
-@pytest.mark.skip(
-    reason="ak._v2.operations.structure.broadcast_arrays to be implemented."
-)
 def test_issue_334():
     a = ak._v2.highlevel.Array([1, 2, 3, 4])
     b = ak._v2.highlevel.Array([-1])

diff --git a/tests/v2/test_0806-empty-lists-cartesian-fix.py b/tests/v2/test_0806-empty-lists-cartesian-fix.py
@@ -45,6 +45,8 @@ def test_cartesian():
     )
     assert to_list(result) == [None, None]
 
-    one, two = ak.broadcast_arrays(candidate, ak._v2.Array([[1, 2, 3], []]))
+    one, two = ak._v2.operations.structure.broadcast_arrays(
+        candidate, ak._v2.Array([[1, 2, 3], []])
+    )
     assert to_list(one) == [None, None]
     assert to_list(two) == [None, None]
diff --git a/tests/v2/test_1017-numpyarray-broadcast.py b/tests/v2/test_1017-numpyarray-broadcast.py
@@ -0,0 +1,19 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+
+import pytest  # noqa: F401
+import numpy as np  # noqa: F401
+import awkward as ak  # noqa: F401
+
+
+def test():
+    x = ak._v2.contents.ListOffsetArray(
+        ak._v2.index.Index64(np.array([0, 2, 2, 4, 6])),
+        ak._v2.contents.NumpyArray(np.arange(8 * 8).reshape(8, -1)),
+    )
+    y = ak._v2.contents.ListOffsetArray(
+        ak._v2.index.Index64(np.array([0, 2, 2, 4, 6])),
+        ak._v2.contents.NumpyArray(np.arange(8)),
+    )
+    u, v = ak._v2.operations.structure.broadcast_arrays(x, y)
+    assert u.ndim == v.ndim