From 6e6a17a7de77fa62f06680449bff078f49c95f00 Mon Sep 17 00:00:00 2001 From: Ricardo Vieira Date: Tue, 13 Aug 2024 01:12:00 +0200 Subject: [PATCH 1/3] Cleanup elemwise_cgen.py --- pytensor/tensor/elemwise_cgen.py | 204 ++++++++++++++----------------- 1 file changed, 92 insertions(+), 112 deletions(-) diff --git a/pytensor/tensor/elemwise_cgen.py b/pytensor/tensor/elemwise_cgen.py index 3e37bf7d1a..397285bc82 100644 --- a/pytensor/tensor/elemwise_cgen.py +++ b/pytensor/tensor/elemwise_cgen.py @@ -1,3 +1,5 @@ +from textwrap import dedent, indent + from pytensor.configdefaults import config @@ -8,12 +10,10 @@ def make_declare(loop_orders, dtypes, sub): """ decl = "" for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): - var = sub[f"lv{int(i)}"] # input name corresponding to ith loop variable + var = sub[f"lv{i}"] # input name corresponding to ith loop variable # we declare an iteration variable # and an integer for the number of dimensions - decl += f""" - {dtype}* {var}_iter; - """ + decl += f"{dtype}* {var}_iter;\n" for j, value in enumerate(loop_order): if value != "x": # If the dimension is not broadcasted, we declare @@ -21,17 +21,15 @@ def make_declare(loop_orders, dtypes, sub): # the stride in that dimension, # and the jump from an iteration to the next decl += f""" - npy_intp {var}_n{int(value)}; - ssize_t {var}_stride{int(value)}; - int {var}_jump{int(value)}_{int(j)}; + npy_intp {var}_n{value}; + ssize_t {var}_stride{value}; + int {var}_jump{value}_{j}; """ else: # if the dimension is broadcasted, we only need # the jump (arbitrary length and stride = 0) - decl += f""" - int {var}_jump{value}_{int(j)}; - """ + decl += f"int {var}_jump{value}_{j};\n" return decl @@ -39,7 +37,7 @@ def make_declare(loop_orders, dtypes, sub): def make_checks(loop_orders, dtypes, sub): init = "" for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): - var = f"%(lv{int(i)})s" + var = sub[f"lv{i}"] # List of dimensions of var that are not broadcasted nonx = [x for x in loop_order if x != "x"] if nonx: @@ -47,12 +45,14 @@ def make_checks(loop_orders, dtypes, sub): # this is a check that the number of dimensions of the # tensor is as expected. 
min_nd = max(nonx) + 1 - init += f""" - if (PyArray_NDIM({var}) < {min_nd}) {{ - PyErr_SetString(PyExc_ValueError, "Not enough dimensions on input."); - %(fail)s - }} - """ + init += dedent( + f""" + if (PyArray_NDIM({var}) < {min_nd}) {{ + PyErr_SetString(PyExc_ValueError, "Not enough dimensions on input."); + {indent(sub["fail"], " " * 12)} + }} + """ + ) # In loop j, adjust represents the difference of values of the # data pointer between the beginning and the end of the @@ -75,9 +75,7 @@ def make_checks(loop_orders, dtypes, sub): adjust = f"{var}_n{index}*{var}_stride{index}" else: jump = f"-({adjust})" - init += f""" - {var}_jump{index}_{j} = {jump}; - """ + init += f"{var}_jump{index}_{j} = {jump};\n" adjust = "0" check = "" @@ -101,34 +99,36 @@ def make_checks(loop_orders, dtypes, sub): j0, x0 = to_compare[0] for j, x in to_compare[1:]: - check += f""" - if (%(lv{j0})s_n{x0} != %(lv{j})s_n{x}) - {{ - if (%(lv{j0})s_n{x0} == 1 || %(lv{j})s_n{x} == 1) + check += dedent( + f""" + if ({sub[f"lv{j0}"]}_n{x0} != {sub[f"lv{j}"]}_n{x}) {{ - PyErr_Format(PyExc_ValueError, "{runtime_broadcast_error_msg}", - {j0}, - {x0}, - (long long int) %(lv{j0})s_n{x0}, - {j}, - {x}, - (long long int) %(lv{j})s_n{x} - ); - }} else {{ - PyErr_Format(PyExc_ValueError, "Input dimension mismatch: (input[%%i].shape[%%i] = %%lld, input[%%i].shape[%%i] = %%lld)", + if ({sub[f"lv{j0}"]}_n{x0} == 1 || {sub[f"lv{j}"]}_n{x} == 1) + {{ + PyErr_Format(PyExc_ValueError, "{runtime_broadcast_error_msg}", {j0}, {x0}, - (long long int) %(lv{j0})s_n{x0}, + (long long int) {sub[f"lv{j0}"]}_n{x0}, {j}, {x}, - (long long int) %(lv{j})s_n{x} - ); + (long long int) {sub[f"lv{j}"]}_n{x} + ); + }} else {{ + PyErr_Format(PyExc_ValueError, "Input dimension mismatch: (input[%%i].shape[%%i] = %%lld, input[%%i].shape[%%i] = %%lld)", + {j0}, + {x0}, + (long long int) {sub[f"lv{j0}"]}_n{x0}, + {j}, + {x}, + (long long int) {sub[f"lv{j}"]}_n{x} + ); + }} + {sub["fail"]} }} - %(fail)s - }} - """ + """ + ) - return init % sub + check % sub + return init + check def compute_output_dims_lengths(array_name: str, loop_orders, sub) -> str: @@ -144,7 +144,7 @@ def compute_output_dims_lengths(array_name: str, loop_orders, sub) -> str: # Borrow the length of the first non-broadcastable input dimension for j, candidate in enumerate(candidates): if candidate != "x": - var = sub[f"lv{int(j)}"] + var = sub[f"lv{j}"] dims_c_code += f"{array_name}[{i}] = {var}_n{candidate};\n" break # If none is non-broadcastable, the output dimension has a length of 1 @@ -177,35 +177,37 @@ def make_alloc(loop_orders, dtype, sub, fortran="0"): # way that its contiguous dimensions match one of the input's # contiguous dimensions, or the dimension with the smallest # stride. Right now, it is allocated to be C_CONTIGUOUS. - return f""" - {{ - npy_intp dims[{nd}]; - //npy_intp* dims = (npy_intp*)malloc({nd} * sizeof(npy_intp)); - {init_dims} - if (!{olv}) {{ - {olv} = (PyArrayObject*)PyArray_EMPTY({nd}, dims, - {type}, - {fortran}); - }} - else {{ - PyArray_Dims new_dims; - new_dims.len = {nd}; - new_dims.ptr = dims; - PyObject* success = PyArray_Resize({olv}, &new_dims, 0, NPY_CORDER); - if (!success) {{ - // If we can't resize the ndarray we have we can allocate a new one. 
- PyErr_Clear(); - Py_XDECREF({olv}); - {olv} = (PyArrayObject*)PyArray_EMPTY({nd}, dims, {type}, 0); - }} else {{ - Py_DECREF(success); + return dedent( + f""" + {{ + npy_intp dims[{nd}]; + {init_dims} + if (!{olv}) {{ + {olv} = (PyArrayObject*)PyArray_EMPTY({nd}, + dims, + {type}, + {fortran}); + }} + else {{ + PyArray_Dims new_dims; + new_dims.len = {nd}; + new_dims.ptr = dims; + PyObject* success = PyArray_Resize({olv}, &new_dims, 0, NPY_CORDER); + if (!success) {{ + // If we can't resize the ndarray we have we can allocate a new one. + PyErr_Clear(); + Py_XDECREF({olv}); + {olv} = (PyArrayObject*)PyArray_EMPTY({nd}, dims, {type}, 0); + }} else {{ + Py_DECREF(success); + }} + }} + if (!{olv}) {{ + {fail} }} }} - if (!{olv}) {{ - {fail} - }} - }} - """ + """ + ) def make_loop(loop_orders, dtypes, loop_tasks, sub, openmp=None): @@ -235,11 +237,11 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub, openmp=None): """ def loop_over(preloop, code, indices, i): - iterv = f"ITER_{int(i)}" + iterv = f"ITER_{i}" update = "" suitable_n = "1" for j, index in enumerate(indices): - var = sub[f"lv{int(j)}"] + var = sub[f"lv{j}"] dtype = dtypes[j] update += f"{dtype} &{var}_i = * ( {var}_iter + {iterv} * {var}_jump{index}_{i} );\n" @@ -305,13 +307,13 @@ def make_reordered_loop( nnested = len(init_loop_orders[0]) # This is the var from which we'll get the loop order - ovar = sub[f"lv{int(olv_index)}"] + ovar = sub[f"lv{olv_index}"] # The loops are ordered by (decreasing) absolute values of ovar's strides. # The first element of each pair is the absolute value of the stride # The second element correspond to the index in the initial loop order order_loops = f""" - std::vector< std::pair > {ovar}_loops({int(nnested)}); + std::vector< std::pair > {ovar}_loops({nnested}); std::vector< std::pair >::iterator {ovar}_loops_it = {ovar}_loops.begin(); """ @@ -319,7 +321,7 @@ def make_reordered_loop( for i, index in enumerate(init_loop_orders[olv_index]): if index != "x": order_loops += f""" - {ovar}_loops_it->first = abs(PyArray_STRIDES({ovar})[{int(index)}]); + {ovar}_loops_it->first = abs(PyArray_STRIDES({ovar})[{index}]); """ else: # Stride is 0 when dimension is broadcastable @@ -328,7 +330,7 @@ def make_reordered_loop( """ order_loops += f""" - {ovar}_loops_it->second = {int(i)}; + {ovar}_loops_it->second = {i}; ++{ovar}_loops_it; """ @@ -352,7 +354,7 @@ def make_reordered_loop( for i in range(nnested): declare_totals += f""" - int TOTAL_{int(i)} = init_totals[{ovar}_loops_it->second]; + int TOTAL_{i} = init_totals[{ovar}_loops_it->second]; ++{ovar}_loops_it; """ @@ -365,7 +367,7 @@ def get_loop_strides(loop_order, i): specified loop_order. 
""" - var = sub[f"lv{int(i)}"] + var = sub[f"lv{i}"] r = [] for index in loop_order: # Note: the stride variable is not declared for broadcasted variables @@ -383,7 +385,7 @@ def get_loop_strides(loop_order, i): ) declare_strides = f""" - int init_strides[{int(nvars)}][{int(nnested)}] = {{ + int init_strides[{nvars}][{nnested}] = {{ {strides} }};""" @@ -394,33 +396,33 @@ def get_loop_strides(loop_order, i): """ for i in range(nvars): - var = sub[f"lv{int(i)}"] + var = sub[f"lv{i}"] declare_strides += f""" {ovar}_loops_rit = {ovar}_loops.rbegin();""" for j in reversed(range(nnested)): declare_strides += f""" - int {var}_stride_l{int(j)} = init_strides[{int(i)}][{ovar}_loops_rit->second]; + int {var}_stride_l{j} = init_strides[{i}][{ovar}_loops_rit->second]; ++{ovar}_loops_rit; """ declare_iter = "" for i, dtype in enumerate(dtypes): - var = sub[f"lv{int(i)}"] + var = sub[f"lv{i}"] declare_iter += f"{var}_iter = ({dtype}*)(PyArray_DATA({var}));\n" pointer_update = "" for j, dtype in enumerate(dtypes): - var = sub[f"lv{int(j)}"] + var = sub[f"lv{j}"] pointer_update += f"{dtype} &{var}_i = * ( {var}_iter" for i in reversed(range(nnested)): - iterv = f"ITER_{int(i)}" - pointer_update += f"+{var}_stride_l{int(i)}*{iterv}" + iterv = f"ITER_{i}" + pointer_update += f"+{var}_stride_l{i}*{iterv}" pointer_update += ");\n" loop = inner_task for i in reversed(range(nnested)): - iterv = f"ITER_{int(i)}" - total = f"TOTAL_{int(i)}" + iterv = f"ITER_{i}" + total = f"TOTAL_{i}" update = "" forloop = "" # The pointers are defined only in the most inner loop @@ -434,36 +436,14 @@ def get_loop_strides(loop_order, i): loop = f""" {forloop} - {{ // begin loop {int(i)} + {{ // begin loop {i} {update} {loop} - }} // end loop {int(i)} + }} // end loop {i} """ - return f"{{\n{order_loops}\n{declare_totals}\n{declare_strides}\n{declare_iter}\n{loop}\n}}\n" - - -# print make_declare(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), -# ('double', 'int', 'float'), -# dict(lv0='x', lv1='y', lv2='z', fail="FAIL;")) - -# print make_checks(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), -# ('double', 'int', 'float'), -# dict(lv0='x', lv1='y', lv2='z', fail="FAIL;")) - -# print make_alloc(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), -# 'double', -# dict(olv='out', lv0='x', lv1='y', lv2='z', fail="FAIL;")) - -# print make_loop(((0, 1, 2, 3), ('x', 1, 0, 3), ('x', 'x', 'x', 0)), -# ('double', 'int', 'float'), -# (("C00;", "C%01;"), ("C10;", "C11;"), ("C20;", "C21;"), ("C30;", "C31;"),"C4;"), -# dict(lv0='x', lv1='y', lv2='z', fail="FAIL;")) - -# print make_loop(((0, 1, 2, 3), (3, 'x', 0, 'x'), (0, 'x', 'x', 'x')), -# ('double', 'int', 'float'), -# (("C00;", "C01;"), ("C10;", "C11;"), ("C20;", "C21;"), ("C30;", "C31;"),"C4;"), -# dict(lv0='x', lv1='y', lv2='z', fail="FAIL;")) + code = "\n".join((order_loops, declare_totals, declare_strides, declare_iter, loop)) + return f"{{\n{code}\n}}\n" ################## From 1a944b7978b4e1f2d65f07220bc71126c6a2e8c7 Mon Sep 17 00:00:00 2001 From: Ricardo Vieira Date: Tue, 6 Aug 2024 16:17:49 +0200 Subject: [PATCH 2/3] Add benchmark test for CAReduce --- tests/tensor/test_elemwise.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/tensor/test_elemwise.py b/tests/tensor/test_elemwise.py index 94e91821fa..284e8051a7 100644 --- a/tests/tensor/test_elemwise.py +++ b/tests/tensor/test_elemwise.py @@ -985,3 +985,29 @@ def test_CAReduce(self): assert isinstance(vect_node.op, Any) assert vect_node.op.axis == (1,) assert vect_node.inputs[0] is 
bool_tns + + +@pytest.mark.parametrize( + "axis", + (0, 1, 2, (0, 1), (0, 2), (1, 2), None), + ids=lambda x: f"axis={x}", +) +@pytest.mark.parametrize( + "c_contiguous", + (True, False), + ids=lambda x: f"c_contiguous={x}", +) +def test_careduce_benchmark(axis, c_contiguous, benchmark): + N = 256 + x_test = np.random.uniform(size=(N, N, N)) + transpose_axis = (0, 1, 2) if c_contiguous else (2, 0, 1) + + x = pytensor.shared(x_test, name="x", shape=x_test.shape) + out = x.transpose(transpose_axis).sum(axis=axis) + fn = pytensor.function([], out) + + np.testing.assert_allclose( + fn(), + x_test.transpose(transpose_axis).sum(axis=axis), + ) + benchmark(fn) From 061ed3acdfb5eb14b0e04d9c504df3cb64194b4c Mon Sep 17 00:00:00 2001 From: Ricardo Vieira Date: Sun, 4 Aug 2024 19:17:05 +0200 Subject: [PATCH 3/3] CAReduce loop reordering C-impl --- pytensor/tensor/elemwise.py | 202 ++++++++--------- pytensor/tensor/elemwise_cgen.py | 373 +++++++++++++++++++++++++------ pytensor/tensor/math.py | 31 +-- 3 files changed, 418 insertions(+), 188 deletions(-) diff --git a/pytensor/tensor/elemwise.py b/pytensor/tensor/elemwise.py index de966f1a78..7b446bcfbf 100644 --- a/pytensor/tensor/elemwise.py +++ b/pytensor/tensor/elemwise.py @@ -1,4 +1,5 @@ from copy import copy +from textwrap import dedent import numpy as np from numpy.core.numeric import normalize_axis_tuple @@ -1466,15 +1467,16 @@ def infer_shape(self, fgraph, node, shapes): return ((),) return ([ishape[i] for i in range(node.inputs[0].type.ndim) if i not in axis],) - def _c_all(self, node, name, inames, onames, sub): - input = node.inputs[0] - output = node.outputs[0] + def _c_all(self, node, name, input_names, output_names, sub): + [inp] = node.inputs + [out] = node.outputs + ndim = inp.type.ndim - iname = inames[0] - oname = onames[0] + [inp_name] = input_names + [out_name] = output_names - idtype = input.type.dtype_specs()[1] - odtype = output.type.dtype_specs()[1] + inp_dtype = inp.type.dtype_specs()[1] + out_dtype = out.type.dtype_specs()[1] acc_dtype = getattr(self, "acc_dtype", None) @@ -1482,100 +1484,97 @@ def _c_all(self, node, name, inames, onames, sub): if acc_dtype == "float16": raise MethodNotDefined("no c_code for float16") acc_type = TensorType(shape=node.outputs[0].type.shape, dtype=acc_dtype) - adtype = acc_type.dtype_specs()[1] + acc_dtype = acc_type.dtype_specs()[1] else: - adtype = odtype + acc_dtype = out_dtype axis = self.axis if axis is None: - axis = list(range(input.type.ndim)) + axis = list(range(inp.type.ndim)) if len(axis) == 0: + # This is just an Elemwise cast operation # The acc_dtype is never a downcast compared to the input dtype # So we just need a cast to the output dtype. 
- var = pytensor.tensor.basic.cast(input, node.outputs[0].dtype) - if var is input: - var = Elemwise(scalar_identity)(input) + var = pytensor.tensor.basic.cast(inp, node.outputs[0].dtype) + if var is inp: + var = Elemwise(scalar_identity)(inp) assert var.dtype == node.outputs[0].dtype - return var.owner.op._c_all(var.owner, name, inames, onames, sub) - - order1 = [i for i in range(input.type.ndim) if i not in axis] - order = order1 + list(axis) + return var.owner.op._c_all(var.owner, name, input_names, output_names, sub) - nnested = len(order1) + inp_dims = list(range(ndim)) + non_reduced_dims = [i for i in inp_dims if i not in axis] + counter = iter(range(ndim)) + acc_dims = ["x" if i in axis else next(counter) for i in range(ndim)] - sub = dict(sub) - for i, (input, iname) in enumerate(zip(node.inputs, inames)): - sub[f"lv{i}"] = iname + sub = sub.copy() + sub["lv0"] = inp_name + sub["lv1"] = out_name + sub["olv"] = out_name - decl = "" - if adtype != odtype: + if acc_dtype != out_dtype: # Create an accumulator variable different from the output - aname = "acc" - decl = acc_type.c_declare(aname, sub) - decl += acc_type.c_init(aname, sub) + acc_name = "acc" + setup = acc_type.c_declare(acc_name, sub) + acc_type.c_init(acc_name, sub) else: # the output is the accumulator variable - aname = oname - - decl += cgen.make_declare([order], [idtype], sub) - checks = cgen.make_checks([order], [idtype], sub) - - alloc = "" - i += 1 - sub[f"lv{i}"] = oname - sub["olv"] = oname - - # Allocate output buffer - alloc += cgen.make_declare( - [list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname) - ) - alloc += cgen.make_alloc([order1], odtype, sub) - alloc += cgen.make_checks( - [list(range(nnested)) + ["x"] * len(axis)], [odtype], dict(sub, lv0=oname) + acc_name = out_name + setup = "" + + # Define strides of input array + setup += cgen.make_declare( + [inp_dims], [inp_dtype], sub, compute_stride_jump=False + ) + cgen.make_checks([inp_dims], [inp_dtype], sub, compute_stride_jump=False) + + # Define strides of output array and allocate it + out_sub = sub | {"lv0": out_name} + alloc = ( + cgen.make_declare( + [acc_dims], [out_dtype], out_sub, compute_stride_jump=False + ) + + cgen.make_alloc([non_reduced_dims], out_dtype, sub) + + cgen.make_checks( + [acc_dims], [out_dtype], out_sub, compute_stride_jump=False + ) ) - if adtype != odtype: - # Allocate accumulation buffer - sub[f"lv{i}"] = aname - sub["olv"] = aname + if acc_dtype != out_dtype: + # Define strides of accumulation buffer and allocate it + sub["lv1"] = acc_name + sub["olv"] = acc_name - alloc += cgen.make_declare( - [list(range(nnested)) + ["x"] * len(axis)], - [adtype], - dict(sub, lv0=aname), - ) - alloc += cgen.make_alloc([order1], adtype, sub) - alloc += cgen.make_checks( - [list(range(nnested)) + ["x"] * len(axis)], - [adtype], - dict(sub, lv0=aname), + acc_sub = sub | {"lv0": acc_name} + alloc += ( + cgen.make_declare( + [acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False + ) + + cgen.make_alloc([non_reduced_dims], acc_dtype, sub) + + cgen.make_checks( + [acc_dims], [acc_dtype], acc_sub, compute_stride_jump=False + ) ) identity = self.scalar_op.identity - if np.isposinf(identity): - if input.type.dtype in ("float32", "float64"): + if inp.type.dtype in ("float32", "float64"): identity = "__builtin_inf()" - elif input.type.dtype.startswith("uint") or input.type.dtype == "bool": + elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool": identity = "1" else: - identity = "NPY_MAX_" + 
str(input.type.dtype).upper() + identity = "NPY_MAX_" + str(inp.type.dtype).upper() elif np.isneginf(identity): - if input.type.dtype in ("float32", "float64"): + if inp.type.dtype in ("float32", "float64"): identity = "-__builtin_inf()" - elif input.type.dtype.startswith("uint") or input.type.dtype == "bool": + elif inp.type.dtype.startswith("uint") or inp.type.dtype == "bool": identity = "0" else: - identity = "NPY_MIN_" + str(input.type.dtype).upper() + identity = "NPY_MIN_" + str(inp.type.dtype).upper() elif identity is None: raise TypeError(f"The {self.scalar_op} does not define an identity.") - task0_decl = f"{adtype}& {aname}_i = *{aname}_iter;\n{aname}_i = {identity};" - - task1_decl = f"{idtype}& {inames[0]}_i = *{inames[0]}_iter;\n" + initial_value = f"{acc_name}_i = {identity};" - task1_code = self.scalar_op.c_code( + inner_task = self.scalar_op.c_code( Apply( self.scalar_op, [ @@ -1588,44 +1587,45 @@ def _c_all(self, node, name, inames, onames, sub): ], ), None, - [f"{aname}_i", f"{inames[0]}_i"], - [f"{aname}_i"], + [f"{acc_name}_i", f"{inp_name}_i"], + [f"{acc_name}_i"], sub, ) - code1 = f""" - {{ - {task1_decl} - {task1_code} - }} - """ - if node.inputs[0].type.ndim: - if len(axis) == 1: - all_code = [("", "")] * nnested + [(task0_decl, code1), ""] - else: - all_code = ( - [("", "")] * nnested - + [(task0_decl, "")] - + [("", "")] * (len(axis) - 2) - + [("", code1), ""] - ) + if out.type.ndim == 0: + # Simple case where everything is reduced, no need for loop ordering + loop = cgen.make_complete_loop_careduce( + inp_var=inp_name, + acc_var=acc_name, + inp_dtype=inp_dtype, + acc_dtype=acc_dtype, + initial_value=initial_value, + inner_task=inner_task, + fail_code=sub["fail"], + ) else: - all_code = [task0_decl + code1] - loop = cgen.make_loop_careduce( - [order, list(range(nnested)) + ["x"] * len(axis)], - [idtype, adtype], - all_code, - sub, - ) + loop = cgen.make_reordered_loop_careduce( + inp_var=inp_name, + acc_var=acc_name, + inp_dtype=inp_dtype, + acc_dtype=acc_dtype, + inp_ndim=ndim, + reduction_axes=axis, + initial_value=initial_value, + inner_task=inner_task, + ) - end = "" - if adtype != odtype: - end = f""" - PyArray_CopyInto({oname}, {aname}); - """ - end += acc_type.c_cleanup(aname, sub) + if acc_dtype != out_dtype: + cast = dedent( + f""" + PyArray_CopyInto({out_name}, {acc_name}); + {acc_type.c_cleanup(acc_name, sub)} + """ + ) + else: + cast = "" - return decl, checks, alloc, loop, end + return setup, alloc, loop, cast def c_code(self, node, name, inames, onames, sub): code = "\n".join(self._c_all(node, name, inames, onames, sub)) @@ -1637,7 +1637,7 @@ def c_headers(self, **kwargs): def c_code_cache_version_apply(self, node): # the version corresponding to the c code in this Op - version = [9] + version = [10] # now we insert versions for the ops on which we depend... scalar_node = Apply( diff --git a/pytensor/tensor/elemwise_cgen.py b/pytensor/tensor/elemwise_cgen.py index 397285bc82..7eb422aa0a 100644 --- a/pytensor/tensor/elemwise_cgen.py +++ b/pytensor/tensor/elemwise_cgen.py @@ -1,9 +1,10 @@ +from collections.abc import Sequence from textwrap import dedent, indent from pytensor.configdefaults import config -def make_declare(loop_orders, dtypes, sub): +def make_declare(loop_orders, dtypes, sub, compute_stride_jump=True): """ Produce code to declare all necessary variables. 
@@ -20,13 +21,11 @@ def make_declare(loop_orders, dtypes, sub): # the number of elements in that dimension, # the stride in that dimension, # and the jump from an iteration to the next - decl += f""" - npy_intp {var}_n{value}; - ssize_t {var}_stride{value}; - int {var}_jump{value}_{j}; - """ + decl += f"npy_intp {var}_n{value};\nssize_t {var}_stride{value};\n" + if compute_stride_jump: + decl += f"int {var}_jump{value}_{j};\n" - else: + elif compute_stride_jump: # if the dimension is broadcasted, we only need # the jump (arbitrary length and stride = 0) decl += f"int {var}_jump{value}_{j};\n" @@ -34,7 +33,7 @@ def make_declare(loop_orders, dtypes, sub): return decl -def make_checks(loop_orders, dtypes, sub): +def make_checks(loop_orders, dtypes, sub, compute_stride_jump=True): init = "" for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): var = sub[f"lv{i}"] @@ -67,13 +66,13 @@ def make_checks(loop_orders, dtypes, sub): # Initialize the variables associated to the jth loop # jump = stride - adjust jump = f"({var}_stride{index}) - ({adjust})" - init += f""" - {var}_n{index} = PyArray_DIMS({var})[{index}]; - {var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype}); - {var}_jump{index}_{j} = {jump}; - """ + init += f"{var}_n{index} = PyArray_DIMS({var})[{index}];\n" + init += f"{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype});\n" + if compute_stride_jump: + init += f"{var}_jump{index}_{j} = {jump};\n" adjust = f"{var}_n{index}*{var}_stride{index}" - else: + + elif compute_stride_jump: jump = f"-({adjust})" init += f"{var}_jump{index}_{j} = {jump};\n" adjust = "0" @@ -460,72 +459,298 @@ def get_loop_strides(loop_order, i): ################ -def make_loop_careduce(loop_orders, dtypes, loop_tasks, sub): +def make_complete_loop_careduce( + inp_var: str, + acc_var: str, + inp_dtype: str, + acc_dtype: str, + initial_value: str, + inner_task: str, + fail_code, +) -> str: + """Generate C code for a complete reduction loop. + + The generated code for a float64 input variable `inp` and accumulation variable `acc` looks like: + + .. code-block:: C + { + NpyIter* iter; + NpyIter_IterNextFunc *iternext; + char** data_ptr; + npy_intp* stride_ptr,* innersize_ptr; + + // Special case for empty inputs + if (PyArray_SIZE(inp) == 0) { + npy_float64 acc_i = *(npy_float64*)(PyArray_DATA(acc)); + acc_i = 0; + }else{ + iter = NpyIter_New(inp, + NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + iternext = NpyIter_GetIterNext(iter, NULL); + if (iternext == NULL) { + NpyIter_Deallocate(iter); + { fail } + } + data_ptr = NpyIter_GetDataPtrArray(iter); + stride_ptr = NpyIter_GetInnerStrideArray(iter); + innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter); + + npy_float64 acc_i; + acc_i = 0; + do { + char* data = *data_ptr; + npy_intp stride = *stride_ptr; + npy_intp count = *innersize_ptr; + + while(count--) { + npy_float64 inp_i = *((npy_float64*)data); + acc_i = acc_i + inp_i; + data += stride; + } + + } while(iternext(iter)); + NpyIter_Deallocate(iter); + + *(npy_float64*)(PyArray_DATA(acc)) = acc_i; + } + } """ - Make a nested loop over several arrays and associate specific code - to each level of nesting. 
+ return dedent( + f""" + {{ + NpyIter* iter; + NpyIter_IterNextFunc *iternext; + char** data_ptr; + npy_intp* stride_ptr,* innersize_ptr; + + // Special case for empty inputs + if (PyArray_SIZE({inp_var}) == 0) {{ + {acc_dtype} &{acc_var}_i = *({acc_dtype}*)(PyArray_DATA({acc_var})); + {initial_value} + }}else{{ + iter = NpyIter_New({inp_var}, + NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + + iternext = NpyIter_GetIterNext(iter, NULL); + if (iternext == NULL) {{ + NpyIter_Deallocate(iter); + {fail_code} + }} - Parameters - ---------- - loop_orders : list of N tuples of length M - Each value of each tuple can be either the index of a dimension to - loop over or the letter 'x' which means there is no looping to be done - over that variable at that point (in other words we broadcast - over that dimension). If an entry is an integer, it will become - an alias of the entry of that rank. - loop_tasks : list of M+1 pieces of code - The ith loop_task is a pair of strings, the first - string is code to be executed before the ith loop starts, the second - one contains code to be executed just before going to the next element - of the ith dimension. - The last element if loop_tasks is a single string, containing code - to be executed at the very end. - sub: dictionary - Maps 'lv#' to a suitable variable name. - The 'lvi' variable corresponds to the ith element of loop_orders. + data_ptr = NpyIter_GetDataPtrArray(iter); + stride_ptr = NpyIter_GetInnerStrideArray(iter); + innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter); - """ + {acc_dtype} {acc_var}_i; + {initial_value} - def loop_over(preloop, code, indices, i): - iterv = f"ITER_{int(i)}" - update = "" - suitable_n = "1" - for j, index in enumerate(indices): - var = sub[f"lv{int(j)}"] - update += f"{var}_iter += {var}_jump{index}_{i};\n" - if index != "x": - suitable_n = f"{var}_n{index}" - return f""" - {preloop} - for (int {iterv} = {suitable_n}; {iterv}; {iterv}--) {{ - {code} - {update} + do {{ + char* data = *data_ptr; + npy_intp stride = *stride_ptr; + npy_intp count = *innersize_ptr; + + while(count--) {{ + {inp_dtype} {inp_var}_i = *(({inp_dtype}*)data); + {inner_task} + data += stride; + }} + }} while(iternext(iter)); + + NpyIter_Deallocate(iter); + *({acc_dtype}*)(PyArray_DATA({acc_var})) = {acc_var}_i; + }} }} """ + ) - preloops = {} - for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)): - for j, index in enumerate(loop_order): - if index != "x": - preloops.setdefault(j, "") - preloops[j] += ( - f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n" - ) % sub - break - else: # all broadcastable - preloops.setdefault(0, "") - preloops[0] += ( - f"%(lv{i})s_iter = ({dtype}*)(PyArray_DATA(%(lv{i})s));\n" - ) % sub - if len(loop_tasks) == 1: - s = preloops.get(0, "") - else: - s = "" - for i, (pre_task, task), indices in reversed( - list(zip(range(len(loop_tasks) - 1), loop_tasks, list(zip(*loop_orders)))) - ): - s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i) +def make_reordered_loop_careduce( + inp_var: str, + acc_var: str, + inp_dtype: str, + acc_dtype: str, + inp_ndim: int, + reduction_axes: Sequence[int], + initial_value: str, + inner_task: str, +) -> str: + """Generate C code for a partial reduction loop, reordering for optimal memory access of the input variable. + + The generated code for a sum along the last axis of a 2D float64 input variable `inp` + in an accumulation variable `acc` looks like: + + .. 
code-block:: C
+        {
+            // Special case for empty inputs
+            if (PyArray_SIZE(inp) == 0) {
+                acc_iter = (npy_float64*)(PyArray_DATA(acc));
+                int n = PyArray_SIZE(acc);
+                for(int i = 0; i < n; i++)
+                {
+                    npy_float64 &acc_i = acc_iter[i];
+                    acc_i = 0;
+                }
+            } else {
+                std::vector< std::pair<npy_intp, int> > loops(2);
+                std::vector< std::pair<npy_intp, int> >::iterator loops_it = loops.begin();
+
+                loops_it->first = abs(PyArray_STRIDES(inp)[0]);
+                loops_it->second = 0;
+                ++loops_it;
+                loops_it->first = abs(PyArray_STRIDES(inp)[1]);
+                loops_it->second = 1;
+                ++loops_it;
+                std::sort(loops.rbegin(), loops.rend());
+
+                int dim_lengths[2] = {inp_n0, inp_n1};
+                int inp_strides[2] = {inp_stride0, inp_stride1};
+                int acc_strides[2] = {acc_stride0, 0};
+                bool reduction_axes[2] = {0, 1};
+
+                loops_it = loops.begin();
+                int dim_length_0 = dim_lengths[loops_it->second];
+                int is_reduction_axis_0 = reduction_axes[loops_it->second];
+                int inp_stride_0 = inp_strides[loops_it->second];
+                int acc_stride_0 = acc_strides[loops_it->second];
+                ++loops_it;
+                int dim_length_1 = dim_lengths[loops_it->second];
+                int is_reduction_axis_1 = reduction_axes[loops_it->second];
+                int inp_stride_1 = inp_strides[loops_it->second];
+                int acc_stride_1 = acc_strides[loops_it->second];
+                ++loops_it;
+
+                inp_iter = (npy_float64*)(PyArray_DATA(inp));
+                acc_iter = (npy_float64*)(PyArray_DATA(acc));
+
+                for(int iter_0 = 0; iter_0<dim_length_0; iter_0++){
+                    for(int iter_1 = 0; iter_1<dim_length_1; iter_1++){
+                        npy_float64 &inp_i = *(inp_iter + inp_stride_1*iter_1 + inp_stride_0*iter_0);
+                        npy_float64 &acc_i = *(acc_iter + acc_stride_1*iter_1 + acc_stride_0*iter_0);
+
+                        if((!is_reduction_axis_0 || iter_0 == 0) && (!is_reduction_axis_1 || iter_1 == 0))
+                        {
+                            acc_i = 0;
+                        }
+
+                        {acc_i = acc_i + inp_i;}
+                    }
+                }
+            }
+        }
+    """
+
+    empty_case = dedent(
+        f"""
+        // Special case for empty inputs
+        if (PyArray_SIZE({inp_var}) == 0) {{
+            {acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
+            int n = PyArray_SIZE({acc_var});
+            for(int i = 0; i < n; i++)
+            {{
+                {acc_dtype} &{acc_var}_i = {acc_var}_iter[i];
+                {initial_value}
+            }}
+        }} else {{
+        """
+    )
+
+    order_loops = dedent(
+        f"""
+        std::vector< std::pair<npy_intp, int> > loops({inp_ndim});
+        std::vector< std::pair<npy_intp, int> >::iterator loops_it = loops.begin();
+        """
+    )
+
+    # Fill the loop vector with the appropriate pairs
+    for i in range(inp_ndim):
+        order_loops += dedent(
+            f"""
+            loops_it->first = abs(PyArray_STRIDES({inp_var})[{i}]);
+            loops_it->second = {i};
+            ++loops_it;"""
+        )
+
+    # We sort in decreasing order so that the outermost loop (loop 0)
+    # has the largest stride, and the innermost loop has the smallest stride.
+    order_loops += "\nstd::sort(loops.rbegin(), loops.rend());\n"
+
+    # Sort shape and strides to match the new order that was computed by sorting the loop vector.
+ counter = iter(range(inp_ndim)) + unsorted_vars = dedent( + f""" + int dim_lengths[{inp_ndim}] = {{{','.join(f'{inp_var}_n{i}' for i in range(inp_ndim))}}}; + int inp_strides[{inp_ndim}] = {{{','.join(f'{inp_var}_stride{i}' for i in range(inp_ndim))}}}; + int acc_strides[{inp_ndim}] = {{{','.join("0" if i in reduction_axes else f'{acc_var}_stride{next(counter)}'for i in range(inp_ndim))}}}; + bool reduction_axes[{inp_ndim}] = {{{', '.join("1" if i in reduction_axes else "0" for i in range(inp_ndim))}}};\n + """ + ) + + sorted_vars = "loops_it = loops.begin();" + for i in range(inp_ndim): + sorted_vars += dedent( + f""" + int dim_length_{i} = dim_lengths[loops_it->second]; + int is_reduction_axis_{i} = reduction_axes[loops_it->second]; + int {inp_var}_stride_{i} = inp_strides[loops_it->second]; + int {acc_var}_stride_{i} = acc_strides[loops_it->second]; + ++loops_it; + """ + ) + + declare_iter = dedent( + f""" + {inp_var}_iter = ({inp_dtype}*)(PyArray_DATA({inp_var})); + {acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var})); + """ + ) + + pointer_update = "" + for var, dtype in ((inp_var, inp_dtype), (acc_var, acc_dtype)): + pointer_update += f"{dtype} &{var}_i = *({var}_iter" + for i in reversed(tuple(range(inp_ndim))): + iter_var = f"iter_{i}" + pointer_update += f" + {var}_stride_{i}*{iter_var}" + pointer_update += ");\n" + + # Set initial value in first iteration of each output + # This happens on the first iteration of every reduction axis + initial_iteration = " && ".join( + f"(!is_reduction_axis_{i} || iter_{i} == 0)" for i in range(inp_ndim) + ) + set_initial_value = dedent( + f""" + if({initial_iteration}) + {{ + {initial_value} + }} + """ + ) + + # We set do pointer_update, initial_value and inner task in inner loop + loop = "\n\n".join((pointer_update, set_initial_value, f"{{{inner_task}}}")) + + # Create outer loops recursively + for i in reversed(range(inp_ndim)): + iter_var = f"iter_{i}" + dim_length = f"dim_length_{i}" + loop = dedent( + f""" + for(int {iter_var} = 0; {iter_var}<{dim_length}; {iter_var}++){{ + {loop} + }} + """ + ) + + non_empty_case = "\n".join( + (order_loops, unsorted_vars, sorted_vars, declare_iter, loop) + ) + code = "\n".join((empty_case, non_empty_case, "}")) + return f"{{\n{code}\n}}\n" diff --git a/pytensor/tensor/math.py b/pytensor/tensor/math.py index 8619b124be..86df161fb6 100644 --- a/pytensor/tensor/math.py +++ b/pytensor/tensor/math.py @@ -1,6 +1,7 @@ import builtins import warnings from collections.abc import Sequence +from textwrap import dedent from typing import TYPE_CHECKING, Optional import numpy as np @@ -361,12 +362,14 @@ def __str__(self): class NonZeroDimsCAReduce(FixedOpCAReduce): - def _c_all(self, node, name, inames, onames, sub): - decl, checks, alloc, loop, end = super()._c_all(node, name, inames, onames, sub) + def _c_all(self, node, name, input_names, output_names, sub): + setup, alloc, loop, cast = super()._c_all( + node, name, input_names, output_names, sub + ) # We add an additional check for zero-sized dimensions (This seems like # something that could enabled in `elemwise_cgen.make_checks`.) - iname = inames[0] + [iname] = input_names axis = self.axis if axis is None: @@ -378,17 +381,19 @@ def _c_all(self, node, name, inames, onames, sub): pattern_ = str(pattern)[1:-1] - decl += f"""int tosum[]={{{pattern_}}};""" - alloc += f""" - for(int i=0;i
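
For intuition, here is a small pure-Python sketch (not part of the patch; the function name
and all helper names are illustrative) of the rule the generated C code in patch 3 applies:
axes of the input are visited from largest to smallest absolute stride, reduction axes get a
zero output stride so their iterations accumulate into the same output element, and the
identity value is written on the first visit of each output element.

    import numpy as np


    def reordered_careduce_sum(inp, reduction_axes):
        """Sum `inp` over `reduction_axes`, visiting axes from largest to smallest |stride|.

        Pure-Python model of the ordering in make_reordered_loop_careduce; illustrative only.
        """
        out_shape = tuple(n for i, n in enumerate(inp.shape) if i not in reduction_axes)
        acc = np.empty(out_shape, dtype=inp.dtype)

        # Visit axes in order of decreasing absolute input stride
        # (the generated C code does std::sort on (|stride|, axis) pairs).
        loop_order = sorted(range(inp.ndim), key=lambda i: abs(inp.strides[i]), reverse=True)

        def nest(depth, idx):
            if depth == inp.ndim:
                out_idx = tuple(idx[i] for i in range(inp.ndim) if i not in reduction_axes)
                # The first visit of an output element has every reduction index at 0,
                # so the identity (0 for sum) is written before any accumulation.
                if all(idx[i] == 0 for i in reduction_axes):
                    acc[out_idx] = 0
                acc[out_idx] += inp[tuple(idx)]
                return
            axis = loop_order[depth]
            for j in range(inp.shape[axis]):
                idx[axis] = j
                nest(depth + 1, idx)

        # An empty input never reaches the innermost body, which is why the generated C
        # code has a separate "empty inputs" branch that only writes the identity.
        nest(0, [0] * inp.ndim)
        return acc


    x = np.random.default_rng(0).normal(size=(64, 8, 4)).transpose(2, 0, 1)  # not C-contiguous
    np.testing.assert_allclose(reordered_careduce_sum(x, reduction_axes=(1,)), x.sum(axis=1))

Ordering by decreasing |stride| makes the innermost loop walk the most contiguous dimension
of the input regardless of how it is transposed, which is what the benchmark added in patch 2
exercises for both c_contiguous=True and c_contiguous=False.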