From 6e6a17a7de77fa62f06680449bff078f49c95f00 Mon Sep 17 00:00:00 2001 From: Ricardo Vieira Date: Tue, 13 Aug 2024 01:12:00 +0200 Subject: [PATCH 1/3] Cleanup elemwise_cgen.py --- pytensor/tensor/elemwise_cgen.py | 204 ++++++++++++++----------------- 1 file changed, 92 insertions(+), 112 deletions(-) diff --git a/pytensor/tensor/elemwise_cgen.py b/pytensor/tensor/elemwise_cgen.py index 3e37bf7d1a..397285bc82 100644 --- a/pytensor/tensor/elemwise_cgen.py +++ b/pytensor/tensor/elemwise_cgen.py @@ -1,3 +1,5 @@ +from textwrap import dedent, indent + from pytensor.configdefaults import config @@ -8,12 +10,10 @@ def make_declare(loop_orders, dtypes, sub): """ decl = "" for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): - var = sub[f"lv{int(i)}"] # input name corresponding to ith loop variable + var = sub[f"lv{i}"] # input name corresponding to ith loop variable # we declare an iteration variable # and an integer for the number of dimensions - decl += f""" - {dtype}* {var}_iter; - """ + decl += f"{dtype}* {var}_iter;\n" for j, value in enumerate(loop_order): if value != "x": # If the dimension is not broadcasted, we declare @@ -21,17 +21,15 @@ def make_declare(loop_orders, dtypes, sub): # the stride in that dimension, # and the jump from an iteration to the next decl += f""" - npy_intp {var}_n{int(value)}; - ssize_t {var}_stride{int(value)}; - int {var}_jump{int(value)}_{int(j)}; + npy_intp {var}_n{value}; + ssize_t {var}_stride{value}; + int {var}_jump{value}_{j}; """ else: # if the dimension is broadcasted, we only need # the jump (arbitrary length and stride = 0) - decl += f""" - int {var}_jump{value}_{int(j)}; - """ + decl += f"int {var}_jump{value}_{j};\n" return decl @@ -39,7 +37,7 @@ def make_declare(loop_orders, dtypes, sub): def make_checks(loop_orders, dtypes, sub): init = "" for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): - var = f"%(lv{int(i)})s" + var = sub[f"lv{i}"] # List of dimensions of var that are not broadcasted nonx = [x for x in loop_order if x != "x"] if nonx: @@ -47,12 +45,14 @@ def make_checks(loop_orders, dtypes, sub): # this is a check that the number of dimensions of the # tensor is as expected. 
min_nd = max(nonx) + 1 - init += f""" - if (PyArray_NDIM({var}) < {min_nd}) {{ - PyErr_SetString(PyExc_ValueError, "Not enough dimensions on input."); - %(fail)s - }} - """ + init += dedent( + f""" + if (PyArray_NDIM({var}) < {min_nd}) {{ + PyErr_SetString(PyExc_ValueError, "Not enough dimensions on input."); + {indent(sub["fail"], " " * 12)} + }} + """ + ) # In loop j, adjust represents the difference of values of the # data pointer between the beginning and the end of the @@ -75,9 +75,7 @@ def make_checks(loop_orders, dtypes, sub): adjust = f"{var}_n{index}*{var}_stride{index}" else: jump = f"-({adjust})" - init += f""" - {var}_jump{index}_{j} = {jump}; - """ + init += f"{var}_jump{index}_{j} = {jump};\n" adjust = "0" check = "" @@ -101,34 +99,36 @@ def make_checks(loop_orders, dtypes, sub): j0, x0 = to_compare[0] for j, x in to_compare[1:]: - check += f""" - if (%(lv{j0})s_n{x0} != %(lv{j})s_n{x}) - {{ - if (%(lv{j0})s_n{x0} == 1 || %(lv{j})s_n{x} == 1) + check += dedent( + f""" + if ({sub[f"lv{j0}"]}_n{x0} != {sub[f"lv{j}"]}_n{x}) {{ - PyErr_Format(PyExc_ValueError, "{runtime_broadcast_error_msg}", - {j0}, - {x0}, - (long long int) %(lv{j0})s_n{x0}, - {j}, - {x}, - (long long int) %(lv{j})s_n{x} - ); - }} else {{ - PyErr_Format(PyExc_ValueError, "Input dimension mismatch: (input[%%i].shape[%%i] = %%lld, input[%%i].shape[%%i] = %%lld)", + if ({sub[f"lv{j0}"]}_n{x0} == 1 || {sub[f"lv{j}"]}_n{x} == 1) + {{ + PyErr_Format(PyExc_ValueError, "{runtime_broadcast_error_msg}", {j0}, {x0}, - (long long int) %(lv{j0})s_n{x0}, + (long long int) {sub[f"lv{j0}"]}_n{x0}, {j}, {x}, - (long long int) %(lv{j})s_n{x} - ); + (long long int) {sub[f"lv{j}"]}_n{x} + ); + }} else {{ + PyErr_Format(PyExc_ValueError, "Input dimension mismatch: (input[%%i].shape[%%i] = %%lld, input[%%i].shape[%%i] = %%lld)", + {j0}, + {x0}, + (long long int) {sub[f"lv{j0}"]}_n{x0}, + {j}, + {x}, + (long long int) {sub[f"lv{j}"]}_n{x} + ); + }} + {sub["fail"]} }} - %(fail)s - }} - """ + """ + ) - return init % sub + check % sub + return init + check def compute_output_dims_lengths(array_name: str, loop_orders, sub) -> str: @@ -144,7 +144,7 @@ def compute_output_dims_lengths(array_name: str, loop_orders, sub) -> str: # Borrow the length of the first non-broadcastable input dimension for j, candidate in enumerate(candidates): if candidate != "x": - var = sub[f"lv{int(j)}"] + var = sub[f"lv{j}"] dims_c_code += f"{array_name}[{i}] = {var}_n{candidate};\n" break # If none is non-broadcastable, the output dimension has a length of 1 @@ -177,35 +177,37 @@ def make_alloc(loop_orders, dtype, sub, fortran="0"): # way that its contiguous dimensions match one of the input's # contiguous dimensions, or the dimension with the smallest # stride. Right now, it is allocated to be C_CONTIGUOUS. - return f""" - {{ - npy_intp dims[{nd}]; - //npy_intp* dims = (npy_intp*)malloc({nd} * sizeof(npy_intp)); - {init_dims} - if (!{olv}) {{ - {olv} = (PyArrayObject*)PyArray_EMPTY({nd}, dims, - {type}, - {fortran}); - }} - else {{ - PyArray_Dims new_dims; - new_dims.len = {nd}; - new_dims.ptr = dims; - PyObject* success = PyArray_Resize({olv}, &new_dims, 0, NPY_CORDER); - if (!success) {{ - // If we can't resize the ndarray we have we can allocate a new one. 
- PyErr_Clear(); - Py_XDECREF({olv}); - {olv} = (PyArrayObject*)PyArray_EMPTY({nd}, dims, {type}, 0); - }} else {{ - Py_DECREF(success); + return dedent( + f""" + {{ + npy_intp dims[{nd}]; + {init_dims} + if (!{olv}) {{ + {olv} = (PyArrayObject*)PyArray_EMPTY({nd}, + dims, + {type}, + {fortran}); + }} + else {{ + PyArray_Dims new_dims; + new_dims.len = {nd}; + new_dims.ptr = dims; + PyObject* success = PyArray_Resize({olv}, &new_dims, 0, NPY_CORDER); + if (!success) {{ + // If we can't resize the ndarray we have we can allocate a new one. + PyErr_Clear(); + Py_XDECREF({olv}); + {olv} = (PyArrayObject*)PyArray_EMPTY({nd}, dims, {type}, 0); + }} else {{ + Py_DECREF(success); + }} + }} + if (!{olv}) {{ + {fail} }} }} - if (!{olv}) {{ - {fail} - }} - }} - """ + """ + ) def make_loop(loop_orders, dtypes, loop_tasks, sub, openmp=None): @@ -235,11 +237,11 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub, openmp=None): """ def loop_over(preloop, code, indices, i): - iterv = f"ITER_{int(i)}" + iterv = f"ITER_{i}" update = "" suitable_n = "1" for j, index in enumerate(indices): - var = sub[f"lv{int(j)}"] + var = sub[f"lv{j}"] dtype = dtypes[j] update += f"{dtype} &{var}_i = * ( {var}_iter + {iterv} * {var}_jump{index}_{i} );\n" @@ -305,13 +307,13 @@ def make_reordered_loop( nnested = len(init_loop_orders[0]) # This is the var from which we'll get the loop order - ovar = sub[f"lv{int(olv_index)}"] + ovar = sub[f"lv{olv_index}"] # The loops are ordered by (decreasing) absolute values of ovar's strides. # The first element of each pair is the absolute value of the stride # The second element correspond to the index in the initial loop order order_loops = f""" - std::vector< std::pair > {ovar}_loops({int(nnested)}); + std::vector< std::pair > {ovar}_loops({nnested}); std::vector< std::pair >::iterator {ovar}_loops_it = {ovar}_loops.begin(); """ @@ -319,7 +321,7 @@ def make_reordered_loop( for i, index in enumerate(init_loop_orders[olv_index]): if index != "x": order_loops += f""" - {ovar}_loops_it->first = abs(PyArray_STRIDES({ovar})[{int(index)}]); + {ovar}_loops_it->first = abs(PyArray_STRIDES({ovar})[{index}]); """ else: # Stride is 0 when dimension is broadcastable @@ -328,7 +330,7 @@ def make_reordered_loop( """ order_loops += f""" - {ovar}_loops_it->second = {int(i)}; + {ovar}_loops_it->second = {i}; ++{ovar}_loops_it; """ @@ -352,7 +354,7 @@ def make_reordered_loop( for i in range(nnested): declare_totals += f""" - int TOTAL_{int(i)} = init_totals[{ovar}_loops_it->second]; + int TOTAL_{i} = init_totals[{ovar}_loops_it->second]; ++{ovar}_loops_it; """ @@ -365,7 +367,7 @@ def get_loop_strides(loop_order, i): specified loop_order. 
""" - var = sub[f"lv{int(i)}"] + var = sub[f"lv{i}"] r = [] for index in loop_order: # Note: the stride variable is not declared for broadcasted variables @@ -383,7 +385,7 @@ def get_loop_strides(loop_order, i): ) declare_strides = f""" - int init_strides[{int(nvars)}][{int(nnested)}] = {{ + int init_strides[{nvars}][{nnested}] = {{ {strides} }};""" @@ -394,33 +396,33 @@ def get_loop_strides(loop_order, i): """ for i in range(nvars): - var = sub[f"lv{int(i)}"] + var = sub[f"lv{i}"] declare_strides += f""" {ovar}_loops_rit = {ovar}_loops.rbegin();""" for j in reversed(range(nnested)): declare_strides += f""" - int {var}_stride_l{int(j)} = init_strides[{int(i)}][{ovar}_loops_rit->second]; + int {var}_stride_l{j} = init_strides[{i}][{ovar}_loops_rit->second]; ++{ovar}_loops_rit; """ declare_iter = "" for i, dtype in enumerate(dtypes): - var = sub[f"lv{int(i)}"] + var = sub[f"lv{i}"] declare_iter += f"{var}_iter = ({dtype}*)(PyArray_DATA({var}));\n" pointer_update = "" for j, dtype in enumerate(dtypes): - var = sub[f"lv{int(j)}"] + var = sub[f"lv{j}"] pointer_update += f"{dtype} &{var}_i = * ( {var}_iter" for i in reversed(range(nnested)): - iterv = f"ITER_{int(i)}" - pointer_update += f"+{var}_stride_l{int(i)}*{iterv}" + iterv = f"ITER_{i}" + pointer_update += f"+{var}_stride_l{i}*{iterv}" pointer_update += ");\n" loop = inner_task for i in reversed(range(nnested)): - iterv = f"ITER_{int(i)}" - total = f"TOTAL_{int(i)}" + iterv = f"ITER_{i}" + total = f"TOTAL_{i}" update = "" forloop = "" # The pointers are defined only in the most inner loop @@ -434,36 +436,14 @@ def get_loop_strides(loop_order, i): loop = f""" {forloop} - {{ // begin loop {int(i)} + {{ // begin loop {i} {update} {loop} - }} // end loop {int(i)} + }} // end loop {i} """ - return f"{{\n{order_loops}\n{declare_totals}\n{declare_strides}\n{declare_iter}\n{loop}\n}}\n" - - -# print make_declare(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), -# ('double', 'int', 'float'), -# dict(lv0='x', lv1='y', lv2='z', fail="FAIL;")) - -# print make_checks(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), -# ('double', 'int', 'float'), -# dict(lv0='x', lv1='y', lv2='z', fail="FAIL;")) - -# print make_alloc(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), -# 'double', -# dict(olv='out', lv0='x', lv1='y', lv2='z', fail="FAIL;")) - -# print make_loop(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), -# ('double', 'int', 'float'), -# (("C00;", "C%01;"), ("C10;", "C11;"), ("C20;", "C21;"), ("C30;", "C31;"),"C4;"), -# dict(lv0='x', lv1='y', lv2='z', fail="FAIL;")) - -# print make_loop(((0, 1, 2, 3), (3, 'x', 0, 'x'), (0, 'x', 'x', 'x')), -# ('double', 'int', 'float'), -# (("C00;", "C01;"), ("C10;", "C11;"), ("C20;", "C21;"), ("C30;", "C31;"),"C4;"), -# dict(lv0='x', lv1='y', lv2='z', fail="FAIL;")) + code = "\n".join((order_loops, declare_totals, declare_strides, declare_iter, loop)) + return f"{{\n{code}\n}}\n" ################## From 1a944b7978b4e1f2d65f07220bc71126c6a2e8c7 Mon Sep 17 00:00:00 2001 From: Ricardo Vieira Date: Tue, 6 Aug 2024 16:17:49 +0200 Subject: [PATCH 2/3] Add benchmark test for CAReduce --- tests/tensor/test_elemwise.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/tensor/test_elemwise.py b/tests/tensor/test_elemwise.py index 94e91821fa..284e8051a7 100644 --- a/tests/tensor/test_elemwise.py +++ b/tests/tensor/test_elemwise.py @@ -985,3 +985,29 @@ def test_CAReduce(self): assert isinstance(vect_node.op, Any) assert vect_node.op.axis == (1,) assert vect_node.inputs[0] is 
bool_tns + + +@pytest.mark.parametrize( + "axis", + (0, 1, 2, (0, 1), (0, 2), (1, 2), None), + ids=lambda x: f"axis={x}", +) +@pytest.mark.parametrize( + "c_contiguous", + (True, False), + ids=lambda x: f"c_contiguous={x}", +) +def test_careduce_benchmark(axis, c_contiguous, benchmark): + N = 256 + x_test = np.random.uniform(size=(N, N, N)) + transpose_axis = (0, 1, 2) if c_contiguous else (2, 0, 1) + + x = pytensor.shared(x_test, name="x", shape=x_test.shape) + out = x.transpose(transpose_axis).sum(axis=axis) + fn = pytensor.function([], out) + + np.testing.assert_allclose( + fn(), + x_test.transpose(transpose_axis).sum(axis=axis), + ) + benchmark(fn) From 061ed3acdfb5eb14b0e04d9c504df3cb64194b4c Mon Sep 17 00:00:00 2001 From: Ricardo Vieira Date: Sun, 4 Aug 2024 19:17:05 +0200 Subject: [PATCH 3/3] CAReduce loop reordering C-impl --- pytensor/tensor/elemwise.py | 202 ++++++++--------- pytensor/tensor/elemwise_cgen.py | 373 +++++++++++++++++++++++++------ pytensor/tensor/math.py | 31 +-- 3 files changed, 418 insertions(+), 188 deletions(-) diff --git a/pytensor/tensor/elemwise.py b/pytensor/tensor/elemwise.py index de966f1a78..7b446bcfbf 100644 --- a/pytensor/tensor/elemwise.py +++ b/pytensor/tensor/elemwise.py @@ -1,4 +1,5 @@ from copy import copy +from textwrap import dedent import numpy as np from numpy.core.numeric import normalize_axis_tuple @@ -1466,15 +1467,16 @@ def infer_shape(self, fgraph, node, shapes): return ((),) return ([ishape[i] for i in range(node.inputs[0].type.ndim) if i not in axis],) - def _c_all(self, node, name, inames, onames, sub): - input = node.inputs[0] - output = node.outputs[0] + def _c_all(self, node, name, input_names, output_names, sub): + [inp] = node.inputs + [out] = node.outputs + ndim = inp.type.ndim - iname = inames[0] - oname = onames[0] + [inp_name] = input_names + [out_name] = output_names - idtype = input.type.dtype_specs()[1] - odtype = output.type.dtype_specs()[1] + inp_dtype = inp.type.dtype_specs()[1] + out_dtype = out.type.dtype_specs()[1] acc_dtype = getattr(self, "acc_dtype", None) @@ -1482,100 +1484,97 @@ def _c_all(self, node, name, inames, onames, sub): if acc_dtype == "float16": raise MethodNotDefined("no c_code for float16") acc_type = TensorType(shape=node.outputs[0].type.shape, dtype=acc_dtype) - adtype = acc_type.dtype_specs()[1] + acc_dtype = acc_type.dtype_specs()[1] else: - adtype = odtype + acc_dtype = out_dtype axis = self.axis if axis is None: - axis = list(range(input.type.ndim)) + axis = list(range(inp.type.ndim)) if len(axis) == 0: + # This is just an Elemwise cast operation # The acc_dtype is never a downcast compared to the input dtype # So we just need a cast to the output dtype. 
- var = pytensor.tensor.basic.cast(input, node.outputs[0].dtype) - if var is input: - var = Elemwise(scalar_identity)(input) + var = pytensor.tensor.basic.cast(inp, node.outputs[0].dtype) + if var is inp: + var = Elemwise(scalar_identity)(inp) assert var.dtype == node.outputs[0].dtype - return var.owner.op._c_all(var.owner, name, inames, onames, sub) - - order1 = [i for i in range(input.type.ndim) if i not in axis] - order = order1 + list(axis) + return var.owner.op._c_all(var.owner, name, input_names, output_names, sub) - nnested = len(order1) + inp_dims = list(range(ndim)) + non_reduced_dims = [i for i in inp_dims if i not in axis] + counter = iter(range(ndim)) + acc_dims = ["x" if i in axis else next(counter) for i in range(ndim)] - sub = dict(sub) - for i, (input, iname) in enumerate(zip(node.inputs, inames)): - sub[f"lv{i}"] = iname + sub = sub.copy() + sub["lv0"] = inp_name + sub["lv1"] = out_name + sub["olv"] = out_name - decl = "" - if adtype != odtype: + if acc_dtype != out_dtype: # Create an accumulator variable different from the output - aname = "acc" - decl = acc_type.c_declare(aname, sub) - decl += acc_type.c_init(aname, sub) + acc_name = "acc" + setup = acc_type.c_declare(acc_name, sub) + acc_type.c_init(acc_name, sub) else: # the output is the accumulator variable - aname = oname - - decl += cgen.make_declare([order], [idtype], sub) - checks = cgen.make_checks([order], [idtype], sub) - - alloc = "" - i += 1 - sub[f"lv{i}"] = oname - sub["olv"] = oname - - # Allocate output buffer - alloc += cgen.make_declare( - [list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname) - ) - alloc += cgen.make_alloc([order1], odtype, sub) - alloc += cgen.make_checks( - [list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname) + acc_name = out_name + setup = "" + + # Define strides of input array + setup += cgen.make_declare( + [inp_dims], [inp_dtype], sub, compute_stride_jump=False + ) + cgen.make_checks([inp_dims], [inp_dtype], sub, compute_stride_jump=False) + + # Define strides of output array and allocate it + out_sub = sub | {"lv0": out_name} + alloc = ( + cgen.make_declare( + [acc_dims], [out_dtype], out_sub, compute_stride_jump=False + ) + + cgen.make_alloc([non_reduced_dims], out_dtype, sub) + + cgen.make_checks( + [acc_dims], [out_dtype], out_sub, compute_stride_jump=False + ) ) - if adtype != odtype: - # Allocate accumulation buffer - sub[f"lv{i}"] = aname - sub["olv"] = aname + if acc_dtype != out_dtype: + # Define strides of accumulation buffer and allocate it + sub["lv1"] = acc_name + sub["olv"] = acc_name - alloc += cgen.make_declare( - [list(range(nnested)) + ["x"] * len(axis)], - [adtype], - dict(sub, lv0=aname), - ) - alloc += cgen.make_alloc([order1], adtype, sub) - alloc += cgen.make_checks( - [list(range(nnested)) + ["x"] * len(axis)], - [adtype], - dict(sub, lv0=aname), + acc_sub = sub | {"lv0": acc_name} + alloc += ( + cgen.make_declare( + [acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False + ) + + cgen.make_alloc([non_reduced_dims], acc_dtype, sub) + + cgen.make_checks( + [acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False + ) ) identity = self.scalar_op.identity - if np.isposinf(identity): - if input.type.dtype in ("float32", "float64"): + if inp.type.dtype in ("float32", "float64"): identity = "__builtin_inf()" - elif input.type.dtype.startswith("uint") or input.type.dtype == "bool": + elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool": identity = "1" else: - identity = "NPY_MAX_" + 
str(input.type.dtype).upper() + identity = "NPY_MAX_" + str(inp.type.dtype).upper() elif np.isneginf(identity): - if input.type.dtype in ("float32", "float64"): + if inp.type.dtype in ("float32", "float64"): identity = "-__builtin_inf()" - elif input.type.dtype.startswith("uint") or input.type.dtype == "bool": + elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool": identity = "0" else: - identity = "NPY_MIN_" + str(input.type.dtype).upper() + identity = "NPY_MIN_" + str(inp.type.dtype).upper() elif identity is None: raise TypeError(f"The {self.scalar_op} does not define an identity.") - task0_decl = f"{adtype}& {aname}_i = *{aname}_iter;\n{aname}_i = {identity};" - - task1_decl = f"{idtype}& {inames[0]}_i = *{inames[0]}_iter;\n" + initial_value = f"{acc_name}_i = {identity};" - task1_code = self.scalar_op.c_code( + inner_task = self.scalar_op.c_code( Apply( self.scalar_op, [ @@ -1588,44 +1587,45 @@ def _c_all(self, node, name, inames, onames, sub): ], ), None, - [f"{aname}_i", f"{inames[0]}_i"], - [f"{aname}_i"], + [f"{acc_name}_i", f"{inp_name}_i"], + [f"{acc_name}_i"], sub, ) - code1 = f""" - {{ - {task1_decl} - {task1_code} - }} - """ - if node.inputs[0].type.ndim: - if len(axis) == 1: - all_code = [("", "")] * nnested + [(task0_decl, code1), ""] - else: - all_code = ( - [("", "")] * nnested - + [(task0_decl, "")] - + [("", "")] * (len(axis) - 2) - + [("", code1), ""] - ) + if out.type.ndim == 0: + # Simple case where everything is reduced, no need for loop ordering + loop = cgen.make_complete_loop_careduce( + inp_var=inp_name, + acc_var=acc_name, + inp_dtype=inp_dtype, + acc_dtype=acc_dtype, + initial_value=initial_value, + inner_task=inner_task, + fail_code=sub["fail"], + ) else: - all_code = [task0_decl + code1] - loop = cgen.make_loop_careduce( - [order, list(range(nnested)) + ["x"] * len(axis)], - [idtype, adtype], - all_code, - sub, - ) + loop = cgen.make_reordered_loop_careduce( + inp_var=inp_name, + acc_var=acc_name, + inp_dtype=inp_dtype, + acc_dtype=acc_dtype, + inp_ndim=ndim, + reduction_axes=axis, + initial_value=initial_value, + inner_task=inner_task, + ) - end = "" - if adtype != odtype: - end = f""" - PyArray_CopyInto({oname}, {aname}); - """ - end += acc_type.c_cleanup(aname, sub) + if acc_dtype != out_dtype: + cast = dedent( + f""" + PyArray_CopyInto({out_name}, {acc_name}); + {acc_type.c_cleanup(acc_name, sub)} + """ + ) + else: + cast = "" - return decl, checks, alloc, loop, end + return setup, alloc, loop, cast def c_code(self, node, name, inames, onames, sub): code = "\n".join(self._c_all(node, name, inames, onames, sub)) @@ -1637,7 +1637,7 @@ def c_headers(self, **kwargs): def c_code_cache_version_apply(self, node): # the version corresponding to the c code in this Op - version = [9] + version = [10] # now we insert versions for the ops on which we depend... scalar_node = Apply( diff --git a/pytensor/tensor/elemwise_cgen.py b/pytensor/tensor/elemwise_cgen.py index 397285bc82..7eb422aa0a 100644 --- a/pytensor/tensor/elemwise_cgen.py +++ b/pytensor/tensor/elemwise_cgen.py @@ -1,9 +1,10 @@ +from collections.abc import Sequence from textwrap import dedent, indent from pytensor.configdefaults import config -def make_declare(loop_orders, dtypes, sub): +def make_declare(loop_orders, dtypes, sub, compute_stride_jump=True): """ Produce code to declare all necessary variables. 
@@ -20,13 +21,11 @@ def make_declare(loop_orders, dtypes, sub): # the number of elements in that dimension, # the stride in that dimension, # and the jump from an iteration to the next - decl += f""" - npy_intp {var}_n{value}; - ssize_t {var}_stride{value}; - int {var}_jump{value}_{j}; - """ + decl += f"npy_intp {var}_n{value};\nssize_t {var}_stride{value};\n" + if compute_stride_jump: + decl += f"int {var}_jump{value}_{j};\n" - else: + elif compute_stride_jump: # if the dimension is broadcasted, we only need # the jump (arbitrary length and stride = 0) decl += f"int {var}_jump{value}_{j};\n" @@ -34,7 +33,7 @@ def make_declare(loop_orders, dtypes, sub): return decl -def make_checks(loop_orders, dtypes, sub): +def make_checks(loop_orders, dtypes, sub, compute_stride_jump=True): init = "" for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): var = sub[f"lv{i}"] @@ -67,13 +66,13 @@ def make_checks(loop_orders, dtypes, sub): # Initialize the variables associated to the jth loop # jump = stride - adjust jump = f"({var}_stride{index}) - ({adjust})" - init += f""" - {var}_n{index} = PyArray_DIMS({var})[{index}]; - {var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype}); - {var}_jump{index}_{j} = {jump}; - """ + init += f"{var}_n{index} = PyArray_DIMS({var})[{index}];\n" + init += f"{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype});\n" + if compute_stride_jump: + init += f"{var}_jump{index}_{j} = {jump};\n" adjust = f"{var}_n{index}*{var}_stride{index}" - else: + + elif compute_stride_jump: jump = f"-({adjust})" init += f"{var}_jump{index}_{j} = {jump};\n" adjust = "0" @@ -460,72 +459,298 @@ def get_loop_strides(loop_order, i): ################ -def make_loop_careduce(loop_orders, dtypes, loop_tasks, sub): +def make_complete_loop_careduce( + inp_var: str, + acc_var: str, + inp_dtype: str, + acc_dtype: str, + initial_value: str, + inner_task: str, + fail_code, +) -> str: + """Generate C code for a complete reduction loop. + + The generated code for a float64 input variable `inp` and accumulation variable `acc` looks like: + + .. code-block:: C + { + NpyIter* iter; + NpyIter_IterNextFunc *iternext; + char** data_ptr; + npy_intp* stride_ptr,* innersize_ptr; + + // Special case for empty inputs + if (PyArray_SIZE(inp) == 0) { + npy_float64 acc_i = *(npy_float64*)(PyArray_DATA(acc)); + acc_i = 0; + }else{ + iter = NpyIter_New(inp, + NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + iternext = NpyIter_GetIterNext(iter, NULL); + if (iternext == NULL) { + NpyIter_Deallocate(iter); + { fail } + } + data_ptr = NpyIter_GetDataPtrArray(iter); + stride_ptr = NpyIter_GetInnerStrideArray(iter); + innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter); + + npy_float64 acc_i; + acc_i = 0; + do { + char* data = *data_ptr; + npy_intp stride = *stride_ptr; + npy_intp count = *innersize_ptr; + + while(count--) { + npy_float64 inp_i = *((npy_float64*)data); + acc_i = acc_i + inp_i; + data += stride; + } + + } while(iternext(iter)); + NpyIter_Deallocate(iter); + + *(npy_float64*)(PyArray_DATA(acc)) = acc_i; + } + } """ - Make a nested loop over several arrays and associate specific code - to each level of nesting. 
+ return dedent( + f""" + {{ + NpyIter* iter; + NpyIter_IterNextFunc *iternext; + char** data_ptr; + npy_intp* stride_ptr,* innersize_ptr; + + // Special case for empty inputs + if (PyArray_SIZE({inp_var}) == 0) {{ + {acc_dtype} &{acc_var}_i = *({acc_dtype}*)(PyArray_DATA({acc_var})); + {initial_value} + }}else{{ + iter = NpyIter_New({inp_var}, + NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + + iternext = NpyIter_GetIterNext(iter, NULL); + if (iternext == NULL) {{ + NpyIter_Deallocate(iter); + {fail_code} + }} - Parameters - ---------- - loop_orders : list of N tuples of length M - Each value of each tuple can be either the index of a dimension to - loop over or the letter 'x' which means there is no looping to be done - over that variable at that point (in other words we broadcast - over that dimension). If an entry is an integer, it will become - an alias of the entry of that rank. - loop_tasks : list of M+1 pieces of code - The ith loop_task is a pair of strings, the first - string is code to be executed before the ith loop starts, the second - one contains code to be executed just before going to the next element - of the ith dimension. - The last element if loop_tasks is a single string, containing code - to be executed at the very end. - sub: dictionary - Maps 'lv#' to a suitable variable name. - The 'lvi' variable corresponds to the ith element of loop_orders. + data_ptr = NpyIter_GetDataPtrArray(iter); + stride_ptr = NpyIter_GetInnerStrideArray(iter); + innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter); - """ + {acc_dtype} {acc_var}_i; + {initial_value} - def loop_over(preloop, code, indices, i): - iterv = f"ITER_{int(i)}" - update = "" - suitable_n = "1" - for j, index in enumerate(indices): - var = sub[f"lv{int(j)}"] - update += f"{var}_iter += {var}_jump{index}_{i};\n" - if index != "x": - suitable_n = f"{var}_n{index}" - return f""" - {preloop} - for (int {iterv} = {suitable_n}; {iterv}; {iterv}--) {{ - {code} - {update} + do {{ + char* data = *data_ptr; + npy_intp stride = *stride_ptr; + npy_intp count = *innersize_ptr; + + while(count--) {{ + {inp_dtype} {inp_var}_i = *(({inp_dtype}*)data); + {inner_task} + data += stride; + }} + }} while(iternext(iter)); + + NpyIter_Deallocate(iter); + *({acc_dtype}*)(PyArray_DATA({acc_var})) = {acc_var}_i; + }} }} """ + ) - preloops = {} - for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): - for j, index in enumerate(loop_order): - if index != "x": - preloops.setdefault(j, "") - preloops[j] += ( - f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n" - ) % sub - break - else: # all broadcastable - preloops.setdefault(0, "") - preloops[0] += ( - f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n" - ) % sub - if len(loop_tasks) == 1: - s = preloops.get(0, "") - else: - s = "" - for i, (pre_task, task), indices in reversed( - list(zip(range(len(loop_tasks) - 1), loop_tasks, list(zip(*loop_orders)))) - ): - s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i) +def make_reordered_loop_careduce( + inp_var: str, + acc_var: str, + inp_dtype: str, + acc_dtype: str, + inp_ndim: int, + reduction_axes: Sequence[int], + initial_value: str, + inner_task: str, +) -> str: + """Generate C code for a partial reduction loop, reordering for optimal memory access of the input variable. + + The generated code for a sum along the last axis of a 2D float64 input variable `inp` + in an accumulation variable `acc` looks like: + + .. 
code-block:: C
+        {
+            // Special case for empty inputs
+            if (PyArray_SIZE(inp) == 0) {
+                acc_iter = (npy_float64*)(PyArray_DATA(acc));
+                int n = PyArray_SIZE(acc);
+                for(int i = 0; i < n; i++)
+                {
+                    npy_float64 &acc_i = acc_iter[i];
+                    acc_i = 0;
+                }
+            } else {
+                std::vector< std::pair<npy_intp, int> > loops(2);
+                std::vector< std::pair<npy_intp, int> >::iterator loops_it = loops.begin();
+
+                loops_it->first = abs(PyArray_STRIDES(inp)[0]);
+                loops_it->second = 0;
+                ++loops_it;
+                loops_it->first = abs(PyArray_STRIDES(inp)[1]);
+                loops_it->second = 1;
+                ++loops_it;
+                std::sort(loops.rbegin(), loops.rend());
+
+                int dim_lengths[2] = {inp_n0, inp_n1};
+                int inp_strides[2] = {inp_stride0, inp_stride1};
+                int acc_strides[2] = {acc_stride0, 0};
+                bool reduction_axes[2] = {0, 1};
+
+                loops_it = loops.begin();
+                int dim_length_0 = dim_lengths[loops_it->second];
+                int is_reduction_axis_0 = reduction_axes[loops_it->second];
+                int inp_stride_0 = inp_strides[loops_it->second];
+                int acc_stride_0 = acc_strides[loops_it->second];
+                ++loops_it;
+                int dim_length_1 = dim_lengths[loops_it->second];
+                int is_reduction_axis_1 = reduction_axes[loops_it->second];
+                int inp_stride_1 = inp_strides[loops_it->second];
+                int acc_stride_1 = acc_strides[loops_it->second];
+                ++loops_it;
+
+                inp_iter = (npy_float64*)(PyArray_DATA(inp));
+                acc_iter = (npy_float64*)(PyArray_DATA(acc));
+
+                for(int iter_0 = 0; iter_0<dim_length_0; iter_0++){
+                    for(int iter_1 = 0; iter_1<dim_length_1; iter_1++){
+                        npy_float64 &inp_i = *(inp_iter + inp_stride_1*iter_1 + inp_stride_0*iter_0);
+                        npy_float64 &acc_i = *(acc_iter + acc_stride_1*iter_1 + acc_stride_0*iter_0);
+
+                        if((!is_reduction_axis_0 || iter_0 == 0) && (!is_reduction_axis_1 || iter_1 == 0))
+                        {
+                            acc_i = 0;
+                        }
+
+                        {acc_i = acc_i + inp_i;}
+                    }
+                }
+            }
+        }
+    """
+
+    empty_case = dedent(
+        f"""
+        // Special case for empty inputs
+        if (PyArray_SIZE({inp_var}) == 0) {{
+            {acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
+            int n = PyArray_SIZE({acc_var});
+            for(int i = 0; i < n; i++)
+            {{
+                {acc_dtype} &{acc_var}_i = {acc_var}_iter[i];
+                {initial_value}
+            }}
+        }} else {{
+        """
+    )
+
+    order_loops = dedent(
+        f"""
+        std::vector< std::pair<npy_intp, int> > loops({inp_ndim});
+        std::vector< std::pair<npy_intp, int> >::iterator loops_it = loops.begin();
+        """
+    )
+
+    # Fill the loop vector with the appropriate pairs
+    for i in range(inp_ndim):
+        order_loops += dedent(
+            f"""
+            loops_it->first = abs(PyArray_STRIDES({inp_var})[{i}]);
+            loops_it->second = {i};
+            ++loops_it;"""
+        )
+
+    # We sort in decreasing order so that the outermost loop (loop 0)
+    # has the largest stride, and the innermost loop has the smallest stride.
+    order_loops += "\nstd::sort(loops.rbegin(), loops.rend());\n"
+
+    # Sort shape and strides to match the new order that was computed by sorting the loop vector.
+ counter = iter(range(inp_ndim)) + unsorted_vars = dedent( + f""" + int dim_lengths[{inp_ndim}] = {{{','.join(f'{inp_var}_n{i}' for i in range(inp_ndim))}}}; + int inp_strides[{inp_ndim}] = {{{','.join(f'{inp_var}_stride{i}' for i in range(inp_ndim))}}}; + int acc_strides[{inp_ndim}] = {{{','.join("0" if i in reduction_axes else f'{acc_var}_stride{next(counter)}'for i in range(inp_ndim))}}}; + bool reduction_axes[{inp_ndim}] = {{{', '.join("1" if i in reduction_axes else "0" for i in range(inp_ndim))}}};\n + """ + ) + + sorted_vars = "loops_it = loops.begin();" + for i in range(inp_ndim): + sorted_vars += dedent( + f""" + int dim_length_{i} = dim_lengths[loops_it->second]; + int is_reduction_axis_{i} = reduction_axes[loops_it->second]; + int {inp_var}_stride_{i} = inp_strides[loops_it->second]; + int {acc_var}_stride_{i} = acc_strides[loops_it->second]; + ++loops_it; + """ + ) + + declare_iter = dedent( + f""" + {inp_var}_iter = ({inp_dtype}*)(PyArray_DATA({inp_var})); + {acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var})); + """ + ) + + pointer_update = "" + for var, dtype in ((inp_var, inp_dtype), (acc_var, acc_dtype)): + pointer_update += f"{dtype} &{var}_i = *({var}_iter" + for i in reversed(tuple(range(inp_ndim))): + iter_var = f"iter_{i}" + pointer_update += f" + {var}_stride_{i}*{iter_var}" + pointer_update += ");\n" + + # Set initial value in first iteration of each output + # This happens on the first iteration of every reduction axis + initial_iteration = " && ".join( + f"(!is_reduction_axis_{i} || iter_{i} == 0)" for i in range(inp_ndim) + ) + set_initial_value = dedent( + f""" + if({initial_iteration}) + {{ + {initial_value} + }} + """ + ) + + # We set do pointer_update, initial_value and inner task in inner loop + loop = "\n\n".join((pointer_update, set_initial_value, f"{{{inner_task}}}")) + + # Create outer loops recursively + for i in reversed(range(inp_ndim)): + iter_var = f"iter_{i}" + dim_length = f"dim_length_{i}" + loop = dedent( + f""" + for(int {iter_var} = 0; {iter_var}<{dim_length}; {iter_var}++){{ + {loop} + }} + """ + ) + + non_empty_case = "\n".join( + (order_loops, unsorted_vars, sorted_vars, declare_iter, loop) + ) + code = "\n".join((empty_case, non_empty_case, "}")) + return f"{{\n{code}\n}}\n" diff --git a/pytensor/tensor/math.py b/pytensor/tensor/math.py index 8619b124be..86df161fb6 100644 --- a/pytensor/tensor/math.py +++ b/pytensor/tensor/math.py @@ -1,6 +1,7 @@ import builtins import warnings from collections.abc import Sequence +from textwrap import dedent from typing import TYPE_CHECKING, Optional import numpy as np @@ -361,12 +362,14 @@ def __str__(self): class NonZeroDimsCAReduce(FixedOpCAReduce): - def _c_all(self, node, name, inames, onames, sub): - decl, checks, alloc, loop, end = super()._c_all(node, name, inames, onames, sub) + def _c_all(self, node, name, input_names, output_names, sub): + setup, alloc, loop, cast = super()._c_all( + node, name, input_names, output_names, sub + ) # We add an additional check for zero-sized dimensions (This seems like # something that could enabled in `elemwise_cgen.make_checks`.) - iname = inames[0] + [iname] = input_names axis = self.axis if axis is None: @@ -378,17 +381,19 @@ def _c_all(self, node, name, inames, onames, sub): pattern_ = str(pattern)[1:-1] - decl += f"""int tosum[]={{{pattern_}}};""" - alloc += f""" - for(int i=0;i
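
For intuition, here is a small pure-Python sketch (not part of the patch; the function name
and all helper names are illustrative) of the rule the generated C code in patch 3 applies:
axes of the input are visited from largest to smallest absolute stride, reduction axes get a
zero output stride so their iterations accumulate into the same output element, and the
identity value is written on the first visit of each output element.

    import numpy as np


    def reordered_careduce_sum(inp, reduction_axes):
        """Sum `inp` over `reduction_axes`, visiting axes from largest to smallest |stride|.

        Pure-Python model of the ordering in make_reordered_loop_careduce; illustrative only.
        """
        out_shape = tuple(n for i, n in enumerate(inp.shape) if i not in reduction_axes)
        acc = np.empty(out_shape, dtype=inp.dtype)

        # Visit axes in order of decreasing absolute input stride
        # (the generated C code does std::sort on (|stride|, axis) pairs).
        loop_order = sorted(range(inp.ndim), key=lambda i: abs(inp.strides[i]), reverse=True)

        def nest(depth, idx):
            if depth == inp.ndim:
                out_idx = tuple(idx[i] for i in range(inp.ndim) if i not in reduction_axes)
                # The first visit of an output element has every reduction index at 0,
                # so the identity (0 for sum) is written before any accumulation.
                if all(idx[i] == 0 for i in reduction_axes):
                    acc[out_idx] = 0
                acc[out_idx] += inp[tuple(idx)]
                return
            axis = loop_order[depth]
            for j in range(inp.shape[axis]):
                idx[axis] = j
                nest(depth + 1, idx)

        # An empty input never reaches the innermost body, which is why the generated C
        # code has a separate "empty inputs" branch that only writes the identity.
        nest(0, [0] * inp.ndim)
        return acc


    x = np.random.default_rng(0).normal(size=(64, 8, 4)).transpose(2, 0, 1)  # not C-contiguous
    np.testing.assert_allclose(reordered_careduce_sum(x, reduction_axes=(1,)), x.sum(axis=1))

Ordering by decreasing |stride| makes the innermost loop walk the most contiguous dimension
of the input regardless of how it is transposed, which is what the benchmark added in patch 2
exercises for both c_contiguous=True and c_contiguous=False.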